From 633efdc305f004fcc807f3eb9b96bcfde2f35cda93b2485157bbe70e75125afe Mon Sep 17 00:00:00 2001 From: Tom Foster Date: Sat, 9 Aug 2025 10:55:42 +0100 Subject: [PATCH 1/3] Use proper binaries --- .gitignore | 1 + helpers/config/quantisation_configs.py | 12 +- helpers/services/binary_manager.py | 491 +++++++++++++++++++++++++ helpers/services/filesystem.py | 2 +- helpers/services/gguf.py | 254 ++++++++++++- helpers/services/huggingface.py | 170 +++++++-- helpers/services/imatrix_generator.py | 258 +++++++++++++ helpers/services/llama_cpp.py | 282 ++++++++++++-- helpers/services/llama_python.py | 8 +- helpers/services/orchestrator.py | 175 ++++++++- helpers/services/quantisation.py | 107 ++++-- helpers/utils/config_parser.py | 72 +++- uv.lock | 40 +- 13 files changed, 1709 insertions(+), 163 deletions(-) create mode 100644 helpers/services/binary_manager.py create mode 100644 helpers/services/imatrix_generator.py diff --git a/.gitignore b/.gitignore index 933b4ec..809ba85 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,4 @@ venv.bak/ # Working directories work/ quantisation_work/ +.cache/ diff --git a/helpers/config/quantisation_configs.py b/helpers/config/quantisation_configs.py index 015951c..133f0ad 100644 --- a/helpers/config/quantisation_configs.py +++ b/helpers/config/quantisation_configs.py @@ -46,15 +46,15 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = { description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output", base_type="Q3_K_M", base_precision=3, - output_type="Q5_K", + output_type="q5_k", ), QuantisationType.Q3_K_XL: QuantisationConfig( name="Q3_K_XL", description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output", base_type="Q3_K_M", base_precision=3, - embedding_type="Q8_0", - output_type="Q6_K", + embedding_type="q8_0", + output_type="q6_k", ), QuantisationType.Q4_K_S: QuantisationConfig( name="Q4_K_S", @@ -78,7 +78,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = { description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings", base_type="Q4_K_M", base_precision=4, - embedding_type="Q8_0", + embedding_type="q8_0", ), # Additional standard quantisation profiles QuantisationType.Q5_K_S: QuantisationConfig( @@ -103,7 +103,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = { description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings", base_type="Q5_K_M", base_precision=5, - embedding_type="Q8_0", + embedding_type="q8_0", ), QuantisationType.Q6_K: QuantisationConfig( name="Q6_K", @@ -121,7 +121,7 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = { description="Bartowski Q6_K_L: Q6_K base with Q8_0 output", base_type="Q6_K", base_precision=6, - output_type="Q8_0", + output_type="q8_0", ), QuantisationType.Q8_0: QuantisationConfig( name="Q8_0", diff --git a/helpers/services/binary_manager.py b/helpers/services/binary_manager.py new file mode 100644 index 0000000..f41f58a --- /dev/null +++ b/helpers/services/binary_manager.py @@ -0,0 +1,491 @@ +"""Binary manager for llama.cpp releases. + +Downloads and manages llama.cpp binary releases from GitHub, handling +platform detection, version checking, and caching. 
+""" + +from __future__ import annotations + +import json +import os +import platform +import shutil +import subprocess +import tarfile +import time +import zipfile +from pathlib import Path +from typing import TYPE_CHECKING, ClassVar +from urllib.request import urlopen, urlretrieve + +from helpers.logger import logger + +if TYPE_CHECKING: + from typing import Any + + +class BinaryManager: + """Manages llama.cpp binary downloads and updates. + + Automatically downloads appropriate llama.cpp releases based on platform, + caches binaries locally, and checks for updates from GitHub releases. + """ + + GITHUB_API = "https://api.github.com/repos/ggml-org/llama.cpp/releases/latest" + # Use local .cache directory in project + BINARY_DIR = Path(".cache") / "llm-gguf-tools" / "binaries" + + # Platform mappings to release asset patterns + PLATFORM_PATTERNS: ClassVar[dict[tuple[str, str], list[str]]] = { + ("Linux", "x86_64"): ["linux-x64", "ubuntu-x64", "linux-amd64"], + ("Linux", "aarch64"): ["linux-arm64", "linux-aarch64"], + ("Darwin", "x86_64"): ["macos-x64", "darwin-x64", "macos-amd64"], + ("Darwin", "arm64"): ["macos-arm64", "darwin-arm64", "macos-aarch64"], + ("Windows", "AMD64"): ["win-x64", "windows-x64", "win64"], + } + + def __init__(self) -> None: + """Initialise binary manager.""" + self.BINARY_DIR.mkdir(parents=True, exist_ok=True) + self.version_file = self.BINARY_DIR / "version.json" + self.quantize_binary_path = self._get_binary_path("llama-quantize") + self.imatrix_binary_path = self._get_binary_path("llama-imatrix") + + def _get_binary_path(self, base_name: str) -> Path: + """Get path to binary. + + Args: + base_name: Base name of binary (without extension). + + Returns: + Path where binary should be located. + """ + binary_name = f"{base_name}.exe" if platform.system() == "Windows" else base_name + return self.BINARY_DIR / binary_name + + def get_quantise_binary(self) -> Path | None: + """Get llama-quantize binary, downloading if necessary. + + Returns: + Path to binary if available, None if download fails. + """ + return self._get_binary("llama-quantize", self.quantize_binary_path) + + def get_imatrix_binary(self) -> Path | None: + """Get llama-imatrix binary, downloading if necessary. + + Returns: + Path to binary if available, None if download fails. + """ + return self._get_binary("llama-imatrix", self.imatrix_binary_path) + + def _get_binary(self, name: str, binary_path: Path) -> Path | None: + """Get a specific binary, downloading if necessary. + + Args: + name: Name of the binary. + binary_path: Path where binary should be located. + + Returns: + Path to binary if available, None if download fails. + """ + # Check if we have a binary and if it needs updating + if self._should_update(): + logger.info("šŸ”„ Checking for llama.cpp updates...") + if not self._download_latest(): + logger.warning("Failed to download latest llama.cpp release") + # Fall back to existing binary if available + if binary_path.exists(): + logger.info(f"Using existing {name} binary") + return binary_path + return None + + if binary_path.exists(): + return binary_path + + logger.info("šŸ“„ Downloading llama.cpp binaries...") + if self._download_latest(): + return binary_path + + return None + + def _should_update(self) -> bool: + """Check if binary needs updating. + + Returns: + True if update needed, False otherwise. 
+ """ + # If no binaries exist, we need to download + if not self.quantize_binary_path.exists() or not self.imatrix_binary_path.exists(): + return True + + # Check version file + if not self.version_file.exists(): + return True + + try: + with Path(self.version_file).open(encoding="utf-8") as f: + cached_version = json.load(f) + + # Check if cached version is older than 7 days + if time.time() - cached_version.get("timestamp", 0) > 7 * 24 * 3600: + return True + + except Exception: + return True + + return False + + def _download_latest(self) -> bool: + """Download latest llama.cpp release. + + Returns: + True if successful, False otherwise. + """ + try: + # Get latest release info + release_info = self._get_latest_release() + if not release_info: + return False + + # Find appropriate asset for platform + asset_url = self._find_platform_asset(release_info["assets"]) + if not asset_url: + logger.warning("No suitable binary found for this platform") + return False + + # Download and extract + logger.info(f"šŸ“„ Downloading from: {asset_url}") + if not self._download_and_extract(asset_url): + return False + + # Save version info + self._save_version_info(release_info) + + logger.info("āœ… Successfully downloaded llama.cpp binary") + except Exception as e: + logger.error(f"Failed to download llama.cpp: {e}") + return False + else: + return True + + def _get_latest_release(self) -> dict[str, Any] | None: + """Get latest release info from GitHub API. + + Returns: + Release info dict or None if failed. + """ + try: + with urlopen(self.GITHUB_API) as response: # noqa: S310 + return json.loads(response.read()) + except Exception as e: + logger.error(f"Failed to fetch release info: {e}") + return None + + def _find_platform_asset(self, assets: list[dict[str, Any]]) -> str | None: + """Find appropriate asset for current platform. + + Returns: + Download URL for appropriate asset or None. + """ + patterns = self._get_platform_patterns() + if not patterns: + return None + + return self._select_best_asset(assets, patterns) + + def _get_platform_patterns(self) -> list[str]: + """Get platform patterns for current system. + + Returns: + List of patterns to match in asset names. + """ + system = platform.system() + machine = platform.machine() + + # Get specific patterns for this platform + patterns = self.PLATFORM_PATTERNS.get((system, machine), []) + if patterns: + return patterns + + # Fall back to generic patterns + generic_patterns = { + "Linux": ["linux", "ubuntu"], + "Darwin": ["macos", "darwin"], + "Windows": ["win", "windows"], + } + return generic_patterns.get(system, []) + + def _select_best_asset(self, assets: list[dict[str, Any]], patterns: list[str]) -> str | None: + """Select the best asset from available options. + + Returns: + Download URL for best matching asset or None. 
+ """ + avoid_patterns = ["cuda", "rocm", "hip", "metal", "sycl"] + prefer_patterns = ["cpu", "vulkan", "avx2", "avx"] + + best_asset = None + best_score = -1 + + for asset in assets: + name = asset["name"].lower() + + # Skip GPU-specific builds + if any(pattern in name for pattern in avoid_patterns): + continue + + # Check platform match + if not any(pattern in name for pattern in patterns): + continue + + score = self._score_asset(name, patterns, prefer_patterns) + if score > best_score: + best_score = score + best_asset = asset + + return best_asset["browser_download_url"] if best_asset else None + + def _score_asset(self, name: str, patterns: list[str], prefer_patterns: list[str]) -> int: + """Score an asset based on platform and preference matching. + + Returns: + Numeric score for asset quality (higher is better). + """ + score = 0 + + # Platform match bonus + if any(pattern in name for pattern in patterns): + score += 10 + + # Preference bonuses + for pattern in prefer_patterns: + if pattern in name: + score += 5 + + # Archive format preference + system = platform.system() + if (system == "Windows" and name.endswith(".zip")) or ( + system != "Windows" and name.endswith(".tar.gz") + ): + score += 2 + + return score + + def _download_and_extract(self, url: str) -> bool: + """Download and extract binary archive. + + Args: + url: Download URL for archive. + + Returns: + True if successful, False otherwise. + """ + try: + # Download to temp file + temp_file = self.BINARY_DIR / "temp_download" + logger.info("ā¬‡ļø Downloading archive...") + urlretrieve(url, temp_file) # noqa: S310 + + # Extract based on file type + if url.endswith(".zip"): + with zipfile.ZipFile(temp_file, "r") as zf: + self._extract_binary_from_archive(zf) + elif url.endswith((".tar.gz", ".tgz")): + with tarfile.open(temp_file, "r:gz") as tf: + self._extract_binary_from_archive(tf) + else: + logger.error(f"Unknown archive format: {url}") + return False + + # Clean up temp file + temp_file.unlink() + + # Make binaries executable on Unix + if platform.system() != "Windows": + self.quantize_binary_path.chmod(0o755) + self.imatrix_binary_path.chmod(0o755) + + except Exception as e: + logger.error(f"Failed to download and extract: {e}") + return False + else: + return True + + def _extract_binary_from_archive(self, archive: Any) -> None: + """Extract llama binaries and their dependencies from archive.""" + target_binaries = { + "llama-quantize": ["llama-quantize", "llama-quantize.exe", "quantize", "quantize.exe"], + "llama-imatrix": ["llama-imatrix", "llama-imatrix.exe", "imatrix", "imatrix.exe"], + } + + # Also extract shared libraries + shared_libs = [ + "libllama.so", + "libggml-base.so", + "libggml.so", + "libllama.dll", + "libggml.dll", + ] + + members = self._get_archive_members(archive) + extracted = self._extract_matching_binaries(archive, members, target_binaries) + self._extract_shared_libraries(archive, members, shared_libs) + self._cleanup_extracted_directories() + self._report_missing_binaries(extracted) + + def _get_archive_members(self, archive: Any) -> list[str]: + """Get list of members from archive. + + Returns: + List of member names in the archive. + """ + if isinstance(archive, zipfile.ZipFile): + return archive.namelist() + return [m.name for m in archive.getmembers()] + + def _extract_matching_binaries( + self, + archive: Any, + members: list[str], + target_binaries: dict[str, list[str]], + ) -> set[str]: + """Extract binaries that match target patterns. 
+ + Returns: + Set of successfully extracted binary types. + """ + extracted = set() + for member in members: + base_name = Path(member).name + + for binary_type, possible_names in target_binaries.items(): + if base_name in possible_names: + self._extract_single_binary(archive, member, binary_type) + extracted.add(binary_type) + break + return extracted + + def _extract_single_binary(self, archive: Any, member: str, binary_type: str) -> None: + """Extract a single binary from archive.""" + logger.info(f"šŸ“¦ Extracting {Path(member).name} as {binary_type}...") + target_path = self._get_binary_path(binary_type) + + if isinstance(archive, zipfile.ZipFile): + self._extract_from_zip(archive, member, target_path) + else: # tarfile + self._extract_from_tar(archive, member, target_path) + + def _extract_from_zip(self, archive: zipfile.ZipFile, member: str, target_path: Path) -> None: + """Extract binary from zip archive.""" + temp_path = self.BINARY_DIR / "temp_binary" + with archive.open(member) as source, temp_path.open("wb") as target: + shutil.copyfileobj(source, target) + shutil.move(str(temp_path), str(target_path)) + + def _extract_from_tar(self, archive: tarfile.TarFile, member: str, target_path: Path) -> None: + """Extract binary from tar archive.""" + archive.extract(member, self.BINARY_DIR) + extracted_path = self.BINARY_DIR / member + if extracted_path != target_path: + shutil.move(str(extracted_path), str(target_path)) + + def _cleanup_extracted_directories(self) -> None: + """Clean up any extracted directories.""" + for item in self.BINARY_DIR.iterdir(): + if item.is_dir() and item.name != "binaries": + shutil.rmtree(item) + + def _extract_shared_libraries( + self, archive: Any, members: list[str], lib_patterns: list[str] + ) -> None: + """Extract shared libraries needed by the binaries. + + Args: + archive: The archive object. + members: List of all archive members. + lib_patterns: Patterns to match for library files. + """ + for member in members: + base_name = Path(member).name + if any(lib in base_name for lib in lib_patterns): + logger.info(f"šŸ“š Extracting library: {base_name}") + target_path = self.BINARY_DIR / base_name + + if isinstance(archive, zipfile.ZipFile): + temp_path = self.BINARY_DIR / "temp_lib" + with archive.open(member) as source, temp_path.open("wb") as target: + shutil.copyfileobj(source, target) + shutil.move(str(temp_path), str(target_path)) + else: # tarfile + archive.extract(member, self.BINARY_DIR) + extracted_path = self.BINARY_DIR / member + if extracted_path != target_path: + shutil.move(str(extracted_path), str(target_path)) + + # Make libraries executable on Unix + if platform.system() != "Windows": + target_path.chmod(0o755) + + def _report_missing_binaries(self, extracted: set[str]) -> None: + """Report any missing binaries.""" + if "llama-quantize" not in extracted: + logger.warning("llama-quantize binary not found in archive") + if "llama-imatrix" not in extracted: + logger.warning("llama-imatrix binary not found in archive") + + def _save_version_info(self, release_info: dict[str, Any]) -> None: + """Save version information to cache. + + Args: + release_info: GitHub release information. 
+ """ + version_data = { + "version": release_info.get("tag_name", "unknown"), + "timestamp": time.time(), + "url": release_info.get("html_url", ""), + } + + with Path(self.version_file).open("w", encoding="utf-8") as f: + json.dump(version_data, f, indent=2) + + logger.info(f"šŸ“Œ Cached version: {version_data['version']}") + + def check_binary_works(self, binary_path: Path | None = None) -> bool: + """Check if the binary actually works. + + Args: + binary_path: Path to binary to check. If None, checks quantize binary. + + Returns: + True if binary executes successfully, False otherwise. + """ + if binary_path is None: + binary_path = self.quantize_binary_path + + if not binary_path.exists(): + return False + + try: + # Set LD_LIBRARY_PATH to include binary directory for shared libraries + env = os.environ.copy() + if platform.system() != "Windows": + lib_path = str(self.BINARY_DIR) + if "LD_LIBRARY_PATH" in env: + env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}" + else: + env["LD_LIBRARY_PATH"] = lib_path + + result = subprocess.run( + [str(binary_path), "--help"], + check=False, + capture_output=True, + text=True, + timeout=5, + env=env, + ) + except Exception: + return False + else: + # llama-quantize returns 1 for --help but shows usage, which means it works + return result.returncode in {0, 1} and "usage:" in result.stdout.lower() diff --git a/helpers/services/filesystem.py b/helpers/services/filesystem.py index f31f68b..6337720 100644 --- a/helpers/services/filesystem.py +++ b/helpers/services/filesystem.py @@ -34,7 +34,7 @@ class FilesystemService: size formatting across the toolset. Returns: - Human-readable file size string (e.g., "1.5G", "750M"). + Human-readable file size string (e.g. "1.5G", "750M"). """ try: result = subprocess.run( diff --git a/helpers/services/gguf.py b/helpers/services/gguf.py index 14819c5..c9ccf80 100644 --- a/helpers/services/gguf.py +++ b/helpers/services/gguf.py @@ -8,6 +8,9 @@ Uses UK English spelling conventions throughout. 
from __future__ import annotations import gc +import json +import traceback +from pathlib import Path from typing import TYPE_CHECKING, Any, Protocol import gguf @@ -38,8 +41,6 @@ class TensorMapper(Protocol): if TYPE_CHECKING: - from pathlib import Path - import numpy as np from helpers.models.conversion import ModelConfig @@ -77,6 +78,11 @@ class GGUFWriter: self.writer.add_description(f"Converted from {model_config.architectures[0]}") self.writer.add_file_type(gguf.LlamaFileType.ALL_F32) + # Log architecture being used + logger.info(f"Setting GGUF architecture: {self.architecture}") + if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}: + logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp") + # Model parameters from config params = model_config.to_gguf_params() self.writer.add_context_length(params.context_length) @@ -122,10 +128,239 @@ class GGUFWriter: self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2)) self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0)) self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0)) - self.writer.add_tokenizer_model(tokeniser_config.get("model_type", "llama")) + + # Add BOS/EOS token addition flags if available + if "add_bos_token" in tokeniser_config: + self.writer.add_add_bos_token(tokeniser_config["add_bos_token"]) + if "add_eos_token" in tokeniser_config: + self.writer.add_add_eos_token(tokeniser_config["add_eos_token"]) + + # Note: tokenizer_model is set by add_tokeniser_vocabulary based on actual tokenizer type logger.info("Added tokeniser configuration") + def add_tokeniser_vocabulary(self, model_path: Path) -> None: + """Add full tokeniser vocabulary to GGUF file. + + Loads and embeds the complete tokeniser vocabulary including tokens, + merges, and scores to enable standalone model usage without external + tokeniser files. Supports BPE, Unigram, and WordPiece tokenizers. + """ + tokenizer_path = model_path / "tokenizer.json" + if not tokenizer_path.exists(): + logger.warning("tokenizer.json not found, skipping vocabulary embedding") + return + + try: + with Path(tokenizer_path).open(encoding="utf-8") as f: + tokenizer_data = json.load(f) + + model_data = tokenizer_data.get("model", {}) + model_type = model_data.get("type", "") + + # Get pre-tokenizer information + pre_tokenizer = tokenizer_data.get("pre_tokenizer", {}) + pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer) + + # Get added tokens + added_tokens = tokenizer_data.get("added_tokens", []) + + if model_type == "BPE": + self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type) + elif model_type == "Unigram": + self._add_unigram_tokenizer(model_data, added_tokens) + elif model_type == "WordPiece": + self._add_wordpiece_tokenizer(model_data, added_tokens) + else: + logger.warning(f"Unsupported tokenizer type: {model_type}") + # Try to add as generic tokenizer + self._add_generic_tokenizer(model_data, tokenizer_data) + + except Exception as e: + logger.error(f"Failed to load tokeniser vocabulary: {e}") + logger.error(traceback.format_exc()) + + def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str: + """Determine pre-tokenizer type from configuration. + + Returns: + Pre-tokenizer type. 
+ """ + if not pre_tokenizer: + return "default" + + # Check for various pre-tokenizer types + pre_type = pre_tokenizer.get("type", "") + if "ByteLevel" in str(pre_type): + return "llama3" + if "Metaspace" in str(pre_type): + return "default" + + return "default" + + def _add_bpe_tokenizer( + self, model_data: dict[str, Any], added_tokens: list[dict[str, Any]], pre_type: str + ) -> None: + """Add BPE tokenizer vocabulary to GGUF.""" + vocab = model_data.get("vocab", {}) + merges = model_data.get("merges", []) + + if not vocab: + logger.warning("No vocabulary found in BPE tokenizer") + return + + # Create token list sorted by index + max_idx = max(vocab.values()) if vocab else 0 + tokens = [""] * (max_idx + 1) + + for token, idx in vocab.items(): + if 0 <= idx < len(tokens): + tokens[idx] = token + + # Handle added tokens + for added_token in added_tokens: + token_id = added_token.get("id") + content = added_token.get("content") + if token_id is not None and content is not None: + if token_id >= len(tokens): + tokens.extend([""] * (token_id - len(tokens) + 1)) + tokens[token_id] = content + + # Prepare token types + token_types = [] + for i, _token in enumerate(tokens): + # Check if it's a special/control token + is_special = any( + added_token.get("id") == i and added_token.get("special", False) + for added_token in added_tokens + ) + if is_special: + token_types.append(gguf.TokenType.CONTROL) + else: + token_types.append(gguf.TokenType.NORMAL) + + # Add to GGUF + self.writer.add_tokenizer_model("gpt2") + self.writer.add_tokenizer_pre(pre_type) + self.writer.add_token_list(tokens) + self.writer.add_token_scores([0.0] * len(tokens)) + self.writer.add_token_types(token_types) + + if merges: + self.writer.add_token_merges(merges) + logger.info(f"Added {len(merges)} BPE merges") + + logger.info(f"Successfully embedded BPE tokeniser ({len(tokens)} tokens)") + + def _add_unigram_tokenizer( + self, + model_data: dict[str, Any], + added_tokens: list[dict[str, Any]], # noqa: ARG002 + ) -> None: + """Add Unigram/SentencePiece tokenizer to GGUF.""" + vocab = model_data.get("vocab", []) + if not vocab: + logger.warning("No vocabulary found in Unigram tokenizer") + return + + tokens = [] + scores = [] + token_types = [] + + # Process regular vocabulary + for item in vocab: + if isinstance(item, list) and len(item) >= 2: + token = item[0] + score = float(item[1]) if len(item) > 1 else 0.0 + tokens.append(token) + scores.append(score) + + # Determine token type + if token.startswith("<") and token.endswith(">"): + token_types.append(gguf.TokenType.CONTROL) + elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"): + token_types.append(gguf.TokenType.BYTE) + else: + token_types.append(gguf.TokenType.NORMAL) + + # Add to GGUF + self.writer.add_tokenizer_model("llama") + self.writer.add_tokenizer_pre("default") + self.writer.add_token_list(tokens) + self.writer.add_token_scores(scores) + self.writer.add_token_types(token_types) + + logger.info(f"Successfully embedded Unigram tokeniser ({len(tokens)} tokens)") + + def _add_wordpiece_tokenizer( + self, + model_data: dict[str, Any], + added_tokens: list[dict[str, Any]], # noqa: ARG002 + ) -> None: + """Add WordPiece tokenizer to GGUF.""" + vocab = model_data.get("vocab", {}) + if not vocab: + logger.warning("No vocabulary found in WordPiece tokenizer") + return + + # Create token list sorted by index + max_idx = max(vocab.values()) if vocab else 0 + tokens = [""] * (max_idx + 1) + + for token, idx in vocab.items(): + if 0 <= idx < len(tokens): 
+ tokens[idx] = token + + # Token types (all normal for WordPiece) + token_types = [gguf.TokenType.NORMAL] * len(tokens) + + # Add to GGUF + self.writer.add_tokenizer_model("bert") + self.writer.add_tokenizer_pre("default") + self.writer.add_token_list(tokens) + self.writer.add_token_scores([0.0] * len(tokens)) + self.writer.add_token_types(token_types) + + logger.info(f"Successfully embedded WordPiece tokeniser ({len(tokens)} tokens)") + + def _add_generic_tokenizer( + self, + model_data: dict[str, Any], + tokenizer_data: dict[str, Any], # noqa: ARG002 + ) -> None: + """Try to add a generic tokenizer based on available data.""" + vocab = model_data.get("vocab") + if not vocab: + logger.warning("Cannot extract vocabulary from unknown tokenizer type") + return + + # Try to extract tokens in a generic way + tokens = [] + if isinstance(vocab, dict): + # Dictionary-style vocab + max_idx = max(vocab.values()) if vocab else 0 + tokens = [""] * (max_idx + 1) + for token, idx in vocab.items(): + if 0 <= idx < len(tokens): + tokens[idx] = token + elif isinstance(vocab, list): + # List-style vocab + for item in vocab: + if isinstance(item, str): + tokens.append(item) + elif isinstance(item, list) and len(item) > 0: + tokens.append(item[0]) + + if tokens: + self.writer.add_tokenizer_model("llama") # Default to llama + self.writer.add_tokenizer_pre("default") + self.writer.add_token_list(tokens) + self.writer.add_token_scores([0.0] * len(tokens)) + self.writer.add_token_types([gguf.TokenType.NORMAL] * len(tokens)) + logger.info(f"Added generic tokeniser ({len(tokens)} tokens)") + else: + logger.warning("Could not extract tokens from unknown tokenizer format") + def add_tensor(self, name: str, data: np.ndarray) -> None: """Add a tensor to the GGUF file. @@ -219,13 +454,20 @@ class GGUFConverter: logger.info(f"Total tensors processed: {tensor_count}") - # Add tokeniser + # Add tokeniser configuration try: tok_config = ConfigParser.load_tokeniser_config(model_path) writer_wrapper.add_tokeniser(tok_config) - logger.info("Tokeniser added") + logger.info("Tokeniser configuration added") except Exception as e: - logger.warning(f"Could not add tokeniser: {e}") + logger.warning(f"Could not add tokeniser configuration: {e}") + + # Add tokeniser vocabulary (critical for standalone usage) + try: + writer_wrapper.add_tokeniser_vocabulary(model_path) + except Exception as e: + logger.error(f"Failed to embed tokeniser vocabulary: {e}") + logger.error("Model will not work without external tokeniser files!") # Finalise file writer_wrapper.finalise() diff --git a/helpers/services/huggingface.py b/helpers/services/huggingface.py index a851cff..7fdac80 100644 --- a/helpers/services/huggingface.py +++ b/helpers/services/huggingface.py @@ -7,6 +7,7 @@ spelling conventions throughout. 
from __future__ import annotations +import json import re import shutil import subprocess @@ -17,6 +18,7 @@ from typing import TYPE_CHECKING from helpers.config.quantisation_configs import QUANTISATION_CONFIGS from helpers.logger import logger from helpers.models.quantisation import QuantisationType +from helpers.utils.config_parser import ConfigParser if TYPE_CHECKING: from helpers.models.quantisation import ModelSource, QuantisationResult @@ -260,14 +262,47 @@ class ReadmeGenerator: # Get original README content original_content = self._get_original_readme(model_source, model_dir) + # Get architecture from config.json + architecture = self._get_architecture(model_dir) + # Generate new README readme_content = self._generate_readme_content( - model_source, results, original_content, output_repo + model_source, results, original_content, output_repo, architecture, models_dir ) readme_path.write_text(readme_content) return readme_path + def _get_architecture(self, model_dir: Path) -> str | None: + """Get the architecture from the model's config.json. + + Returns: + Architecture name or None if not found. + """ + config_path = model_dir / "config.json" + if not config_path.exists(): + return None + + try: + with config_path.open(encoding="utf-8") as f: + config = json.load(f) + + # Get the architectures field - it's a list + architectures = config.get("architectures", []) + if architectures: + arch_name = architectures[0] + + # Get the mapped architecture (what it will be converted to) + parser = ConfigParser() + mapped_arch = parser.get_architecture_mapping(arch_name) + + logger.info(f"Architecture: {arch_name} -> {mapped_arch}") + return mapped_arch + except Exception as e: + logger.warning(f"Could not determine architecture: {e}") + + return None + def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]: """Extract original README and metadata. @@ -427,6 +462,8 @@ class ReadmeGenerator: results: dict[QuantisationType, QuantisationResult], original_content: dict[str, str], output_repo: str | None = None, + architecture: str | None = None, + models_dir: Path | None = None, ) -> str: """Generate complete README content with quantisation details. @@ -436,22 +473,27 @@ class ReadmeGenerator: Returns: Complete README markdown content. """ - # Build tags - our_tags = [ - "quantised", - "gguf", - "q3_k_m", - "q3_k_l", - "q3_k_xl", - "q4_k_m", - "q4_k_l", - "q5_k_m", - "q5_k_l", - "q6_k", - "q6_k_l", - "q8_0", - "bartowski-method", - ] + # Build tags based on actual successful quantisations + our_tags = ["gguf"] + + # Add tags for successful quantisations only + for quant_type, result in results.items(): + if hasattr(result, "status") and result.status == "completed": + if quant_type == "F16": + our_tags.append("f16") + elif hasattr(result, "quantisation_type"): + # Convert to lowercase tag format (e.g., Q3_K_M -> q3_k_m) + our_tags.append(result.quantisation_type.value.lower()) + + # If no quantisations succeeded but F16 is available, still add basic tags + if ( + len(our_tags) == 1 + and "F16" in results + and hasattr(results["F16"], "status") + and results["F16"].status in {"completed", "uploading"} + ): + our_tags.append("f16") + original_tags = original_content["tags"].split(",") if original_content["tags"] else [] all_tags = sorted(set(our_tags + original_tags)) @@ -476,8 +518,8 @@ GGUF quantisations of [{model_source.source_model}]({hf_url}) using [Bartowski](https://huggingface.co/bartowski)'s method. 
Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools) which replicates Bartowski's quantisation profiles. -| Variant | Configuration | File Size | Status | -|---|---|---|---| +| Variant | Configuration | Status | +|---|---|---| """ # Add results table - group by layer config patterns @@ -500,24 +542,91 @@ which replicates Bartowski's quantisation profiles. result = type("Result", (), {"status": "planned", "success": False})() config = QUANTISATION_CONFIGS.get(quant_type) - file_size = self._format_file_size(result) status = self._format_status(result, model_source, quant_type, output_repo) # Get configuration description from the config itself - config_desc = config.get_compact_config(QUANTISATION_CONFIGS) if config else f"{quant_type} all layers" + config_desc = ( + config.get_compact_config(QUANTISATION_CONFIGS) + if config + else f"{quant_type} all layers" + ) - content += f"| **{quant_type.value}** | {config_desc} | {file_size} | {status} |\n" + content += f"| **{quant_type.value}** | {config_desc} | {status} |\n" + + # Add F16 row at the bottom if we converted from SafeTensors + # Note: Named "f16" for compatibility, but contains mixed F16/F32 tensors + # (BF16 source tensors are converted to F32 to preserve precision) + if not model_source.is_gguf_repo and output_repo: + f16_filename = f"{model_source.original_author}-{model_source.model_name}-f16.gguf" + f16_url = f"https://huggingface.co/{output_repo}/blob/main/{f16_filename}" + + # Get F16 result from results dict (if tracking it) + f16_result = results.get("F16") + + # Get file size + f16_size = "-" + if f16_result and hasattr(f16_result, "file_size"): + f16_size = f16_result.file_size + elif models_dir: + # Try to get from actual file + f16_path = models_dir / model_source.model_name / f16_filename + if f16_path.exists(): + size_bytes = f16_path.stat().st_size + size_gb = size_bytes / GIBIBYTE + f16_size = f"{size_gb:.1f}GB" + + # Format status based on upload state + if f16_result and hasattr(f16_result, "status"): + if f16_result.status == "uploading": + f16_status = f"ā¬†ļø Uploading... ({f16_size})" + elif f16_result.status == "completed": + f16_status = f"[āœ… {f16_size}]({f16_url})" + else: + f16_status = "ā³ Queued" + else: + # Default to available if no status tracking + f16_status = f"[āœ… {f16_size}]({f16_url})" + + content += f"| **F16** | Full precision GGUF (F16/F32 mixed) | {f16_status} |\n" content += """ **Key:** `E` = Embeddings, `O` = Output, `A` = Attention, `F` = FFN -See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/bartowski_analysis.md) -for detailed quantisation strategies and [Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/) -for more on the tools and methods I use. - """ + # Add warning for unsupported architectures + if architecture: + supported_archs = { + "llama", + "qwen2", + "gemma", + "phi3", + "falcon", + "gpt2", + "gptj", + "gptneox", + "mpt", + "baichuan", + "stablelm", + } + if architecture not in supported_archs: + content += ( + f"āš ļø **Note:** This model uses the `{architecture}` architecture, which is not " + "yet supported by llama.cpp for quantisation. If quantisations failed, this is " + "why - llama.cpp cannot quantise architectures it doesn't recognise. The F16 " + "GGUF file is provided as a full-precision fallback (requires ~2x model size " + f"in VRAM). 
For `{architecture}` support, check with your inference software " + "or wait for llama.cpp updates.\n\n" + ) + + content += ( + "See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/" + "bartowski_analysis.md) for detailed quantisation strategies and " + "[Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/) " + "for more on the tools and methods I use.\n\n" + ) + # Add original content if original_content["readme"]: content += "## Original Model Card\n\n---\n\n" + original_content["readme"] @@ -570,6 +679,15 @@ for more on the tools and methods I use. if hasattr(result, "status") and result.status in status_map: base_status = status_map[result.status] + # Check for architecture not supported error + if ( + result.status == "failed" + and hasattr(result, "error_message") + and result.error_message + and "architecture not supported" in str(result.error_message).lower() + ): + return "āš ļø Skipped" + if result.status == "uploading" and hasattr(result, "file_size") and result.file_size: return f"{base_status} ({result.file_size})" if result.status == "completed" or (hasattr(result, "success") and result.success): diff --git a/helpers/services/imatrix_generator.py b/helpers/services/imatrix_generator.py new file mode 100644 index 0000000..c6139bc --- /dev/null +++ b/helpers/services/imatrix_generator.py @@ -0,0 +1,258 @@ +"""Importance matrix generation service. + +Generates importance matrices using llama-imatrix binary with calibration +data for improved quantisation quality. +""" + +from __future__ import annotations + +import os +import platform +import subprocess +from pathlib import Path +from typing import TYPE_CHECKING + +from helpers.logger import logger +from helpers.services.binary_manager import BinaryManager + +if TYPE_CHECKING: + from helpers.models.quantisation import ModelSource + + +class IMatrixGenerator: + """Generates importance matrices for quantisation guidance. + + Uses llama-imatrix binary to compute importance matrices from + calibration data, which helps preserve model quality during + quantisation by identifying critical weights. + """ + + # Default calibration data location + CALIBRATION_DATA = Path("resources") / "imatrix_data.txt" + + def __init__(self) -> None: + """Initialise imatrix generator.""" + self.binary_manager = BinaryManager() + self.imatrix_binary = self._get_imatrix_binary() + + def _get_imatrix_binary(self) -> Path | None: + """Get llama-imatrix binary, downloading if necessary. + + Returns: + Path to binary if found, None otherwise. + """ + # First check local directory for manual placement + local_binary = Path("./llama-imatrix") + if local_binary.exists(): + logger.info(f"Using local llama-imatrix binary: {local_binary}") + return local_binary + + # Download from GitHub releases + binary_path = self.binary_manager.get_imatrix_binary() + if binary_path and self.binary_manager.check_binary_works(binary_path): + logger.info(f"Using llama-imatrix binary: {binary_path}") + return binary_path + + logger.warning("llama-imatrix binary not available") + return None + + def can_generate(self) -> bool: + """Check if imatrix generation is available. + + Returns: + True if binary and calibration data are available. + """ + return self.imatrix_binary is not None and self.CALIBRATION_DATA.exists() + + def generate_imatrix( + self, + f16_model_path: Path, + output_path: Path, + calibration_data: Path | None = None, + ) -> bool: + """Generate importance matrix for a model. 
+ + Returns: + True if generation successful, False otherwise. + """ + validation_error = self._validate_generation_inputs(f16_model_path, calibration_data) + if validation_error: + logger.error(validation_error) + return False + + cal_data = calibration_data or self.CALIBRATION_DATA + cmd = self._build_imatrix_command(f16_model_path, cal_data, output_path) + + self._log_generation_start(f16_model_path, cal_data, output_path) + + return self._execute_imatrix_generation(cmd, output_path) + + def _validate_generation_inputs( + self, + f16_model_path: Path, + calibration_data: Path | None, + ) -> str | None: + """Validate inputs for imatrix generation. + + Returns: + Error message if validation fails, None if valid. + """ + if not self.imatrix_binary: + return "llama-imatrix binary not available" + + if not f16_model_path.exists(): + return f"Model file not found: {f16_model_path}" + + cal_data = calibration_data or self.CALIBRATION_DATA + if not cal_data.exists(): + return f"Calibration data not found: {cal_data}" + + return None + + def _build_imatrix_command( + self, + f16_model_path: Path, + cal_data: Path, + output_path: Path, + ) -> list[str]: + """Build command for imatrix generation. + + Returns: + Command list ready for subprocess execution. + """ + return [ + str(self.imatrix_binary), + "-m", + str(f16_model_path), + "-f", + str(cal_data), + "-o", + str(output_path), + "--chunks", + "128", # Process in chunks for stability + ] + + def _log_generation_start( + self, + f16_model_path: Path, + cal_data: Path, + output_path: Path, + ) -> None: + """Log the start of imatrix generation.""" + logger.info("🧮 Generating importance matrix...") + logger.info(f"šŸ“Š Model: {f16_model_path.name}") + logger.info(f"šŸ“ Calibration data: {cal_data.name}") + logger.info(f"šŸ’¾ Output: {output_path.name}") + + def _execute_imatrix_generation(self, cmd: list[str], output_path: Path) -> bool: + """Execute the imatrix generation process. + + Returns: + True if generation completed successfully, False otherwise. + """ + # Set LD_LIBRARY_PATH for shared libraries + env = os.environ.copy() + if platform.system() != "Windows": + lib_path = str(self.binary_manager.BINARY_DIR) + if "LD_LIBRARY_PATH" in env: + env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}" + else: + env["LD_LIBRARY_PATH"] = lib_path + + try: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + bufsize=1, + env=env, + ) + + self._stream_process_output(process) + return self._handle_process_completion(process, output_path) + + except Exception as e: + logger.error(f"āŒ Imatrix generation failed: {e}") + return False + + def _stream_process_output(self, process: subprocess.Popen[str]) -> None: + """Stream output from the running process.""" + while True: + if process.stdout is not None: + output = process.stdout.readline() + else: + break + if not output and process.poll() is not None: + break + if output: + # Filter progress updates for cleaner output + line = output.strip() + if line and not line.startswith("["): + logger.info(f" {line}") + + def _handle_process_completion(self, process: subprocess.Popen[str], output_path: Path) -> bool: + """Handle completion of the imatrix generation process. + + Returns: + True if process completed successfully and output exists, False otherwise. 
+ """ + return_code = process.poll() + if return_code != 0: + logger.error(f"āŒ Imatrix generation failed with return code {return_code}") + return False + + if not output_path.exists(): + logger.error("Generation completed but output file not found") + return False + + size_mb = output_path.stat().st_size / (1024 * 1024) + logger.info(f"āœ… Generated imatrix: {output_path.name} ({size_mb:.1f} MB)") + return True + + def prompt_for_generation( + self, + model_source: ModelSource, + model_dir: Path, + f16_model_path: Path, + ) -> Path | None: + """Prompt user to generate imatrix. + + Args: + model_source: Model source information. + model_dir: Model directory. + f16_model_path: Path to F16 model. + + Returns: + Path to generated imatrix or None if skipped. + """ + if not self.can_generate(): + logger.info("āš ļø Imatrix generation not available (missing binary or calibration data)") + return None + + logger.info("\n" + "=" * 70) + logger.info("šŸ“Š Importance Matrix Generation") + logger.info("=" * 70) + logger.info( + "\nImportance matrices improve quantisation quality by identifying" + "\ncritical weights in the model. This process takes 5-10 minutes" + "\nbut significantly improves the quality of smaller quantisations." + ) + logger.info(f"\nModel: {model_source.model_name}") + logger.info(f"Calibration data: {self.CALIBRATION_DATA.name}") + + response = input("\nā“ Generate importance matrix? (Y/n): ").strip().lower() + + if response == "n": + logger.info("Skipping imatrix generation") + return None + + # Generate imatrix + output_path = model_dir / "imatrix.dat" + logger.info("\nā³ Generating importance matrix (this may take 5-10 minutes)...") + + if self.generate_imatrix(f16_model_path, output_path): + return output_path + + logger.warning("Failed to generate imatrix, continuing without it") + return None diff --git a/helpers/services/llama_cpp.py b/helpers/services/llama_cpp.py index 418f965..93783b3 100644 --- a/helpers/services/llama_cpp.py +++ b/helpers/services/llama_cpp.py @@ -1,82 +1,294 @@ -"""Importance matrix (imatrix) management service. +"""Direct llama.cpp binary execution service. -Manages detection and use of existing importance matrix files for -quantisation guidance. Provides user prompts for supplying pre-computed -imatrix files from external sources. +Provides direct execution of llama.cpp quantisation binary with proper +tensor-specific override support for L and XL variants. """ from __future__ import annotations +import os +import platform +import subprocess +from pathlib import Path from typing import TYPE_CHECKING from helpers.logger import logger +from helpers.services.binary_manager import BinaryManager from helpers.services.filesystem import FilesystemService if TYPE_CHECKING: - from pathlib import Path + from helpers.models.quantisation import QuantisationConfig -class IMatrixManager: - """Handles importance matrix file management for quantisation. +class QuantisationExecutor: + """Executes llama.cpp quantisation with tensor overrides. - Locates existing importance matrix files or prompts users to provide - pre-computed matrices from external sources. These matrices guide - quantisation decisions to preserve model quality. + Provides direct binary execution with proper command-line flags for + tensor-specific overrides, supporting Bartowski-style L and XL variants. 
""" def __init__(self) -> None: - """Initialise IMatrixManager.""" + """Initialise quantisation executor.""" + self.fs = FilesystemService() + self.binary_manager = BinaryManager() + self.quantise_binary = self._get_quantise_binary() + self.last_error: str | None = None # Track last error type + + def _get_quantise_binary(self) -> Path | None: + """Get llama-quantize binary, downloading if necessary. + + Returns: + Path to binary if found, None otherwise. + """ + # First check local directory for manual placement + local_binary = Path("./llama-quantize") + if local_binary.exists(): + logger.info(f"Using local llama-quantize binary: {local_binary}") + return local_binary + + # Download from GitHub releases + binary_path = self.binary_manager.get_quantise_binary() + if binary_path and self.binary_manager.check_binary_works(binary_path): + logger.info(f"Using llama-quantize binary: {binary_path}") + return binary_path + + logger.error("Failed to obtain llama-quantize binary") + logger.info( + "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases" + ) + return None + + def execute_quantisation( + self, + input_path: Path, + output_path: Path, + config: QuantisationConfig, + imatrix_path: Path | None = None, + ) -> bool: + """Execute quantisation using llama.cpp binary. + + Builds and executes llama-quantize command with proper tensor override + flags for L and XL variants. + + Returns: + True if quantisation successful, False otherwise. + """ + if not self.quantise_binary: + logger.error("llama-quantize binary not available") + return False + + # Build command + cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path) + + # Execute with real-time output + return self._execute_command(cmd) + + def _build_quantisation_command( + self, + input_path: Path, + output_path: Path, + config: QuantisationConfig, + imatrix_path: Path | None, + ) -> list[str]: + """Build llama-quantize command with tensor overrides. + + Returns: + Command arguments as list. + """ + cmd = [str(self.quantise_binary)] + + # Add imatrix if available + if imatrix_path: + cmd.extend(["--imatrix", str(imatrix_path)]) + if imatrix_path.exists(): + logger.info(f"🧮 Using imatrix: {imatrix_path.name}") + + # Add tensor-specific overrides for L and XL variants + if config.embedding_type: + # Use directly from config - already in correct format + cmd.extend(["--token-embedding-type", config.embedding_type.lower()]) + logger.info(f"āš™ļø Token embedding type: {config.embedding_type}") + + if config.output_type: + # Use directly from config - already in correct format + cmd.extend(["--output-tensor-type", config.output_type.lower()]) + logger.info(f"āš™ļø Output tensor type: {config.output_type}") + + # Note: Per-layer tensor overrides could be added here if needed in future + # For now, embedding and output overrides handle the L/XL variants + + # Get base quantisation type + base_quant = self._get_base_quantisation_type(config.name) + + # Add input, output, and base quantisation type + cmd.extend([str(input_path), str(output_path), base_quant]) + + return cmd + + def _get_base_quantisation_type(self, config_name: str) -> str: + """Get base quantisation type for a config. + + Maps custom variants (Q3_K_L, Q3_K_XL) to their base types (Q3_K_M). + + Returns: + Base quantisation type string. 
+ """ + # Mapping of custom variants to base types + variant_mapping = { + "Q3_K_L": "Q3_K_M", + "Q3_K_XL": "Q3_K_M", + "Q4_K_L": "Q4_K_M", + "Q4_K_XL": "Q4_K_M", + "Q5_K_L": "Q5_K_M", + "Q5_K_XL": "Q5_K_M", + "Q6_K_L": "Q6_K", + "Q6_K_XL": "Q6_K", + } + + return variant_mapping.get(config_name, config_name) + + def _execute_command(self, cmd: list[str]) -> bool: + """Execute command with real-time output streaming. + + Returns: + True if successful, False otherwise. + """ + logger.info(f"šŸ’» Running: {' '.join(cmd)}") + logger.info("ā³ Quantisation in progress... (this may take several minutes)") + + # Set LD_LIBRARY_PATH for shared libraries + env = os.environ.copy() + if platform.system() != "Windows": + lib_path = str(self.binary_manager.BINARY_DIR) + if "LD_LIBRARY_PATH" in env: + env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}" + else: + env["LD_LIBRARY_PATH"] = lib_path + + # Track output for architecture detection + output_lines = [] + architecture_error = False + + try: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + bufsize=1, + env=env, + ) + + # Stream output + while True: + if process.stdout is not None: + output = process.stdout.readline() + else: + break + if not output and process.poll() is not None: + break + if output: + output_stripped = output.strip() + logger.info(f"šŸ“Š {output_stripped}") + output_lines.append(output_stripped) + + # Check for architecture-related errors + if any( + phrase in output_stripped.lower() + for phrase in [ + "unsupported architecture", + "unknown architecture", + "architecture not supported", + "model architecture", + "llama_model_load: error loading model", + ] + ): + architecture_error = True + + return_code = process.poll() + if return_code == 0: + logger.info("āœ… Quantisation successful!") + return True + + # Check if this was an architecture error + if architecture_error or return_code == 1: + # Look for architecture info in recent output + for line in output_lines[-10:]: # Check last 10 lines + if "architecture" in line.lower(): + logger.error("āŒ Architecture not supported by llama.cpp") + logger.error(" so cannot be quantised with current llama.cpp but") + logger.error(" F16 GGUF file can be used for inference if supported") + # Store this for the orchestrator to detect + self.last_error = "unsupported_architecture" + return False + + logger.error(f"āŒ Quantisation failed with return code {return_code}") + + except Exception as e: + logger.error(f"āŒ Quantisation failed with exception: {e}") + return False + else: + return False + + +class IMatrixHandler: + """Handles importance matrix file management. + + Manages detection and use of existing importance matrix files for + quantisation guidance. + """ + + def __init__(self) -> None: + """Initialise IMatrixHandler.""" self.fs = FilesystemService() def find_imatrix(self, model_dir: Path) -> Path | None: - """Find or prompt for importance matrix file. - - Searches for existing imatrix files first, then provides interactive - prompts for user-supplied matrices. See docs/imatrix_data.md for - instructions on generating imatrix files. + """Find existing imatrix file in model directory. Returns: - Path to imatrix file, or None if not available. + Path to imatrix file if found, None otherwise. 
""" imatrix_path = model_dir / "imatrix.dat" - # Check for existing imatrix if imatrix_path.exists(): - logger.info(f"Found existing imatrix: {imatrix_path.name}") + file_size = self.fs.get_file_size(imatrix_path) + logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})") return imatrix_path - # Try user-provided imatrix - return self._prompt_for_user_imatrix(model_dir, imatrix_path) + return None - def _prompt_for_user_imatrix(self, model_dir: Path, imatrix_path: Path) -> Path | None: + def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None: """Prompt user for existing imatrix file. Returns: Path to user-provided imatrix, or None if not available. """ + imatrix_path = model_dir / "imatrix.dat" + logger.info(f"Model directory: {model_dir}") logger.info(f"Looking for imatrix file at: {imatrix_path}") - logger.info("\n" + "=" * 70) - logger.info("šŸ“Š No existing imatrix file found") - logger.info("\nYou have two options:") - logger.info(" 1. Provide a pre-computed imatrix file") - logger.info(" (šŸ’” see docs/imatrix_data.md to generate your own)") - logger.info(" 2. Skip imatrix usage (lower quality quantisation)") - logger.info("=" * 70) + logger.info( + "Tip: You can download pre-computed imatrix files from Bartowski's repositories!" + ) + logger.info( + " Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix" + ) - response = input("\nā“ Do you have an imatrix file to provide? (y/N): ").strip().lower() + response = ( + input("\nā“ Do you have an imatrix file to place in the model directory? (y/N): ") + .strip() + .lower() + ) if response != "y": - logger.info("Continuing without imatrix (quantisation quality may be lower)") - logger.info("ā„¹ļø See docs/imatrix_data.md for instructions on generating imatrix files") # noqa: RUF001 return None - logger.info(f"\nPlease place your imatrix.dat file in: {model_dir}") - input("ā³ Press Enter when you've placed the file (or Ctrl+C to cancel)...") + logger.info(f"Please place your imatrix.dat file in: {model_dir}") + input("ā³ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...") if imatrix_path.exists(): file_size = self.fs.get_file_size(imatrix_path) - logger.info(f"āœ… Found imatrix file! ({file_size})") + logger.info(f"Found imatrix file! ({file_size})") return imatrix_path logger.warning("No imatrix.dat file found - continuing without imatrix") diff --git a/helpers/services/llama_python.py b/helpers/services/llama_python.py index 157bbed..b451af2 100644 --- a/helpers/services/llama_python.py +++ b/helpers/services/llama_python.py @@ -86,8 +86,8 @@ class LlamaCppPythonAPI: raise RuntimeError(msg) # Normalise the config name to extract base type - # E.g., "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K) - # E.g., "Q4_K_M_XXL" -> "Q4_K_M" + # e.g. "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K) + # e.g. "Q4_K_M_XXL" -> "Q4_K_M" config_upper = config_name.upper() # Direct mapping for exact matches @@ -224,7 +224,7 @@ class LlamaCppPythonAPI: Args: input_path: Path to input GGUF model. output_path: Path for output quantised model. - base_type: Base quantisation type (e.g., "Q4_K_M", "Q6_K"). + base_type: Base quantisation type (e.g. "Q4_K_M", "Q6_K"). embedding_type: Override for token embeddings (None = use base). output_type: Override for output/lm_head layers (None = use base). imatrix_path: Optional importance matrix file. @@ -470,7 +470,7 @@ class LlamaCppPythonAPI: """Log current resource usage state. 
Args: - phase: Description of current phase (e.g., "before", "after"). + phase: Description of current phase (e.g. "before", "after"). Returns: Current memory usage in GB. diff --git a/helpers/services/orchestrator.py b/helpers/services/orchestrator.py index 2aeb43c..e28ee93 100644 --- a/helpers/services/orchestrator.py +++ b/helpers/services/orchestrator.py @@ -31,12 +31,14 @@ from helpers.models.quantisation import ( QuantisationType, ) from helpers.services.huggingface import ReadmeGenerator -from helpers.services.llama_cpp import IMatrixManager +from helpers.services.imatrix_generator import IMatrixGenerator +from helpers.services.llama_cpp import IMatrixHandler from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine from helpers.utils.tensor_mapping import URLParser if TYPE_CHECKING: from types import FrameType + from typing import Any @dataclass(slots=True) @@ -55,7 +57,8 @@ class QuantisationOrchestrator: # Service dependencies with factory defaults url_parser: URLParser = field(default_factory=URLParser) quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine) - imatrix_manager: IMatrixManager = field(default_factory=IMatrixManager) + imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler) + imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator) readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator) uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader) @@ -172,18 +175,28 @@ class QuantisationOrchestrator: self.models_dir.mkdir(parents=True, exist_ok=True) f16_model_path = self.model_manager.prepare_model(model_source) - imatrix_path = None - if self.use_imatrix: - logger.info("Checking for importance matrix (imatrix)...") - imatrix_path = self.imatrix_manager.find_imatrix( - self.models_dir / model_source.model_name - ) - output_repo = ( f"{self.uploader.get_username()}/" f"{model_source.original_author}-{model_source.model_name}-GGUF" ) + imatrix_path = None + if self.use_imatrix: + logger.info("Checking for importance matrix (imatrix)...") + model_dir = self.models_dir / model_source.model_name + imatrix_path = self.imatrix_handler.find_imatrix(model_dir) + + # If no imatrix found, offer to generate or provide one + if not imatrix_path: + # First offer to generate + imatrix_path = self.imatrix_generator.prompt_for_generation( + model_source, model_dir, f16_model_path + ) + + # If generation was skipped, offer to provide existing one + if not imatrix_path: + imatrix_path = self.imatrix_handler.prompt_for_user_imatrix(model_dir) + return model_source, f16_model_path, imatrix_path, output_repo def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None: @@ -222,10 +235,63 @@ class QuantisationOrchestrator: types_list = [qt.value for qt in quantisation_types] logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}") + # Track F16 in results for status display (if we converted from SafeTensors) + if not model_source.is_gguf_repo: + # Get F16 file size + f16_size = "-" + if f16_model_path.exists(): + size_bytes = f16_model_path.stat().st_size + size_gb = size_bytes / (1024**3) + f16_size = f"{size_gb:.1f}GB" + + # Create a simple object for F16 tracking (not a QuantisationResult) + # since F16 isn't a quantisation type in our enum + f16_result = type( + "F16Result", + (), + { + "quantisation_type": "F16", + "success": True, + "status": "planned", + "file_path": f16_model_path, + 
"file_size": f16_size, + }, + )() + results["F16"] = f16_result + # Process with parallel uploads - quantise sequentially but upload in background - upload_futures = [] + upload_futures: list[Any] = [] + architecture_unsupported = False + with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor: + # Start F16 upload first if we have one + if not model_source.is_gguf_repo and not self.no_upload and "F16" in results: + f16_result = results["F16"] + if f16_result.file_path and f16_result.file_path.exists(): + logger.info("Starting parallel upload of F16 GGUF...") + f16_result.status = "uploading" + self._update_readme_status(model_source, results, output_repo) + + upload_future = upload_executor.submit( + self._upload_f16_and_cleanup, + output_repo, + f16_result.file_path, + model_source, + results, + ) + upload_futures.append(upload_future) for i, quant_type in enumerate(quantisation_types, 1): + # Skip remaining quantisations if architecture is unsupported + if architecture_unsupported: + logger.info(f"Skipping {quant_type.value} - architecture not supported") + results[quant_type] = QuantisationResult( + quantisation_type=quant_type, + success=False, + status="failed", + error_message="Architecture not supported by llama.cpp", + ) + continue + logger.info( f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}" ) @@ -247,6 +313,30 @@ class QuantisationOrchestrator: results[quant_type] = result logger.debug(f"DEBUG: Quantisation {quant_type.value} completed") + # Check if this failed due to unsupported architecture + if ( + not result.success + and hasattr(self.quantisation_engine.executor, "last_error") + and self.quantisation_engine.executor.last_error + == "unsupported_architecture" + ): + logger.warning( + "Architecture not supported - skipping remaining quantisations" + ) + architecture_unsupported = True + # Update the current result to also show as skipped + result.error_message = "Architecture not supported by llama.cpp" + # Update README immediately to show remaining quantizations as skipped + for remaining_quant_type in quantisation_types[i:]: + if remaining_quant_type not in results: + results[remaining_quant_type] = QuantisationResult( + quantisation_type=remaining_quant_type, + success=False, + status="failed", + error_message="Architecture not supported by llama.cpp", + ) + self._update_readme_status(model_source, results, output_repo) + # Force cleanup between quantisations gc.collect() logger.debug("DEBUG: Garbage collection completed") @@ -269,6 +359,14 @@ class QuantisationOrchestrator: # Wait for all uploads to complete before returning self._wait_for_uploads(upload_futures) + # Final README update to ensure all statuses are accurate + if not self.no_upload and upload_futures: + logger.info("Updating README with final status...") + final_readme = self.readme_generator.generate( + model_source, results, self.models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, final_readme) + return results def _process_single_quantisation( @@ -505,12 +603,26 @@ class QuantisationOrchestrator: def _wait_for_uploads(self, upload_futures: list) -> None: """Wait for all parallel uploads to complete.""" - logger.info("Waiting for any remaining uploads to complete...") + if not upload_futures: + return + + logger.info(f"Waiting for {len(upload_futures)} uploads to complete...") + completed = 0 + failed = 0 + for future in upload_futures: try: future.result(timeout=300) # 5 minute timeout per upload + completed += 1 + 
logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed") except Exception as e: - logger.warning(f"Upload error: {e}") + failed += 1 + logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}") + + if failed > 0: + logger.warning(f"Upload summary: {completed} succeeded, {failed} failed") + else: + logger.info(f"All {completed} uploads completed successfully") def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None: """Clean up temporary files after processing.""" @@ -573,6 +685,45 @@ class QuantisationOrchestrator: ) # Don't re-raise - let other uploads continue + def _upload_f16_and_cleanup( + self, + output_repo: str, + file_path: Path, + model_source: ModelSource, + results: dict[str, QuantisationResult], + ) -> None: + """Upload F16 file and clean up (runs in background thread).""" + try: + logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})") + self.uploader.upload_model_file(output_repo, file_path) + logger.info("[PARALLEL] Upload of F16 GGUF completed successfully") + + # Don't delete F16 yet - we still need it for quantisations + # It will be deleted in _cleanup_files after all quantisations complete + + results["F16"].status = "completed" + updated_readme_path = self.readme_generator.generate( + model_source, results, self.models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, updated_readme_path) + + logger.info("[PARALLEL] F16 upload complete") + except Exception as e: + logger.error(f"[PARALLEL] Failed to upload F16: {e}") + results["F16"].status = "failed" + results["F16"].error_message = str(e) + + try: + updated_readme_path = self.readme_generator.generate( + model_source, results, self.models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, updated_readme_path) + except Exception as readme_error: + logger.error( + f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}" + ) + # Don't re-raise - let other uploads continue + def _print_model_info(self, model_source: ModelSource) -> None: """Print model information.""" logger.info(f"Source URL: {model_source.url}") diff --git a/helpers/services/quantisation.py b/helpers/services/quantisation.py index 0023d22..a48b6f0 100644 --- a/helpers/services/quantisation.py +++ b/helpers/services/quantisation.py @@ -22,7 +22,7 @@ from helpers.models.quantisation import ( ) from helpers.services.filesystem import FilesystemService from helpers.services.gguf import GGUFConverter -from helpers.services.llama_python import LlamaCppPythonAPI +from helpers.services.llama_cpp import QuantisationExecutor from helpers.utils.config_parser import ConfigParser from helpers.utils.tensor_mapping import TensorMapper @@ -32,30 +32,28 @@ class QuantisationEngine: Provides flexible quantisation execution supporting multiple tensor precision configurations, importance matrices, and fallback strategies. - Uses llama-cpp-python API for direct quantisation without subprocess overhead. + Uses direct llama.cpp binary execution with proper tensor overrides. """ def __init__(self) -> None: """Initialise quantisation engine.""" self.fs = FilesystemService() - self.python_api = LlamaCppPythonAPI() + self.executor = QuantisationExecutor() def quantise(self, context: QuantisationContext) -> QuantisationResult: """Perform quantisation using the specified configuration. - Executes quantisation using Python API. Since llama-cpp-python is a - required dependency, we can rely on it being available. 
+ Executes quantisation using direct llama.cpp binary with proper + tensor override flags for L and XL variants. Returns: QuantisationResult with success status and file information. """ - logger.debug(f"DEBUG: Starting quantisation for {context.config.name}") logger.info( f"āš™ļø Creating {context.config.name} quantisation ({context.config.description})..." ) output_path = context.get_output_path() - logger.debug(f"DEBUG: Output path: {output_path}") # Check input file exists and is readable if not context.f16_model_path.exists(): @@ -67,34 +65,20 @@ class QuantisationEngine: error_message=error_msg, ) - # Check if we have enough disk space (rough estimate) - try: - input_size = context.f16_model_path.stat().st_size - logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB") - # This is a rough check - actual available space calculation is more complex - logger.debug(f"DEBUG: Output directory: {output_path.parent}") - except Exception as e: - logger.warning(f"āš ļø Could not check disk space: {e}") - logger.info(f"šŸŽÆ Attempting {context.config.name} quantisation...") - logger.debug(f"DEBUG: Source: {context.f16_model_path}") - logger.debug(f"DEBUG: Target: {output_path}") - logger.debug(f"DEBUG: imatrix: {context.imatrix_path}") + logger.info(f"šŸ“ Source: {context.f16_model_path}") + logger.info(f"šŸ“ Target: {output_path}") try: - # Use Python API for quantisation - logger.info("šŸ Using Python API for quantisation...") - logger.debug("DEBUG: Calling python_api.quantise_model...") + # Use direct binary execution for quantisation + logger.info("šŸ”§ Using llama.cpp binary for quantisation...") - success = self.python_api.quantise_model( + success = self.executor.execute_quantisation( context.f16_model_path, output_path, context.config, context.imatrix_path ) - logger.debug(f"DEBUG: Python API returned: {success}") - if success: - logger.debug("DEBUG: Quantisation successful, creating success result") - return self._create_success_result(context.config.name, output_path, "Python API") + return self._create_success_result(context.config.name, output_path, "llama.cpp") logger.error(f"āŒ {context.config.name} quantisation failed") return QuantisationResult( @@ -175,7 +159,7 @@ class ModelManager: logger.info(f"ā¬‡ļø Downloading GGUF file from repository: {model_source.source_model}") logger.info(f"šŸ” Looking for file pattern: *{model_source.gguf_file_pattern}*") - f16_model = model_dir / f"{model_source.model_name}-f16.gguf" + f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf" if f16_model.exists(): logger.info(f"āœ… Found existing F16 file: {f16_model.name}") @@ -339,9 +323,17 @@ class ModelManager: Raises: RuntimeError: If download fails. 
""" + # Ensure the model directory and .huggingface subdirectory exist + model_dir.mkdir(parents=True, exist_ok=True) + huggingface_dir = model_dir / ".huggingface" + huggingface_dir.mkdir(parents=True, exist_ok=True) + try: - logger.debug(f"DEBUG: Downloading full repository: {source_model}") - result = subprocess.run( + logger.info(f"ā¬‡ļø Downloading full repository: {source_model}") + logger.info("šŸ“Š Progress will be shown below...") + + # Use subprocess.Popen to stream output in real-time + process = subprocess.Popen( [ "huggingface-cli", "download", @@ -349,13 +341,34 @@ class ModelManager: "--local-dir", str(model_dir), ], - check=True, - capture_output=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, text=True, + bufsize=1, # Line buffered + universal_newlines=True, ) - logger.debug( - f"DEBUG: Repository download completed with return code {result.returncode}" - ) + + # Stream output line by line + for line in process.stdout: + # Log download progress lines + if line.strip(): + # Check if it's a progress line (contains %) + if "%" in line or "Downloading" in line or "Fetching" in line: + # Use info level for progress lines + logger.info(f" {line.strip()}") + else: + # Use debug for other output + logger.debug(f" {line.strip()}") + + # Wait for process to complete + return_code = process.wait() + + if return_code != 0: + msg = f"Repository download failed with return code {return_code}" + raise RuntimeError(msg) + + logger.info("āœ… Repository download completed successfully") + except subprocess.CalledProcessError as e: logger.error(f"āŒ Failed to download repository {source_model}") logger.error(f"Return code: {e.returncode}") @@ -386,7 +399,7 @@ class ModelManager: RuntimeError: If conversion fails. """ logger.info("šŸ”„ Converting to GGUF F16 format...") - f16_model = model_dir / f"{model_source.model_name}-f16.gguf" + f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf" if f16_model.exists(): logger.info("āœ… F16 model already exists") @@ -414,6 +427,28 @@ class ModelManager: if arch != arch_name: logger.info(f"šŸ“ Architecture mapping: {arch_name} → {arch}") + # Check if architecture is supported by llama.cpp + supported_archs = { + "llama", + "qwen2", + "gemma", + "phi3", + "falcon", + "gpt2", + "gptj", + "gptneox", + "mpt", + "baichuan", + "stablelm", + } + + if arch not in supported_archs: + logger.warning("=" * 70) + logger.warning(f"āš ļø Architecture '{arch_name}' may not be supported by llama.cpp") + logger.warning(f"āš ļø The GGUF will be created with architecture: '{arch}'") + logger.warning("āš ļø Check if your inference software supports this architecture.") + logger.warning("=" * 70) + # Convert using GGUFConverter tensor_mapper = TensorMapper() success = GGUFConverter.convert_safetensors( diff --git a/helpers/utils/config_parser.py b/helpers/utils/config_parser.py index 5df8ed0..76690e1 100644 --- a/helpers/utils/config_parser.py +++ b/helpers/utils/config_parser.py @@ -107,28 +107,44 @@ class ConfigParser: @staticmethod def get_architecture_mapping(architecture: str) -> str: - """Map architecture names to known GGUF architectures. + """Get the GGUF architecture name for a model. - Provides fallback mappings for architectures not directly supported - by GGUF format, translating them to similar known architectures. This - enables broader model compatibility whilst maintaining GGUF standards. + Returns the original architecture name to preserve model identity. 
+ Only maps architectures that are truly compatible. Returns: - GGUF-compatible architecture name with appropriate fallback to llama. + Architecture name for GGUF, preserving original when possible. """ - # Architecture mappings to known GGUF types - mappings = { - "DotsOCRForCausalLM": "qwen2", # Similar architecture - "GptOssForCausalLM": "llama", # Use llama as fallback - "MistralForCausalLM": "llama", # Mistral is llama-like - "Qwen2ForCausalLM": "qwen2", + # Only map architectures that are ACTUALLY the same + # DO NOT map incompatible architectures + known_compatible = { "LlamaForCausalLM": "llama", + "MistralForCausalLM": "llama", # Mistral IS llama-compatible + "Qwen2ForCausalLM": "qwen2", "GemmaForCausalLM": "gemma", "Phi3ForCausalLM": "phi3", - # Add more mappings as needed + "FalconForCausalLM": "falcon", + "GPT2LMHeadModel": "gpt2", + "GPTJForCausalLM": "gptj", + "GPTNeoXForCausalLM": "gptneox", + "MPTForCausalLM": "mpt", + "BaichuanForCausalLM": "baichuan", + "StableLMEpochForCausalLM": "stablelm", } - return mappings.get(architecture, "llama") # Default to llama + if architecture in known_compatible: + return known_compatible[architecture] + + # For unknown architectures, preserve the original name + # This will make it clear the model needs proper support + # Remove common suffixes to get cleaner architecture name + arch_name = architecture + for suffix in ["ForCausalLM", "LMHeadModel", "ForConditionalGeneration"]: + if arch_name.endswith(suffix): + arch_name = arch_name[: -len(suffix)] + break + + return arch_name.lower() @staticmethod def load_tokeniser_config(model_path: Path) -> dict[str, Any]: @@ -155,11 +171,33 @@ class ConfigParser: config = fs.load_json_config(tokeniser_config_path) - # Extract token IDs with defaults + # Try to find special token IDs from added_tokens_decoder + added_tokens = config.get("added_tokens_decoder", {}) + eos_token_id = config.get("eos_token_id") + bos_token_id = config.get("bos_token_id") + + # If not directly specified, search in added_tokens_decoder + if eos_token_id is None: + for token_id, token_info in added_tokens.items(): + if token_info.get("content") == "<|endoftext|>": + eos_token_id = int(token_id) + break + + if bos_token_id is None: + for token_id, token_info in added_tokens.items(): + if token_info.get("content") in {"<|im_start|>", "", "<|startoftext|>"}: + bos_token_id = int(token_id) + break + + # Extract token IDs with better defaults return { - "bos_token_id": config.get("bos_token_id", 1), - "eos_token_id": config.get("eos_token_id", 2), + "bos_token_id": bos_token_id if bos_token_id is not None else 1, + "eos_token_id": eos_token_id if eos_token_id is not None else 2, "unk_token_id": config.get("unk_token_id", 0), - "pad_token_id": config.get("pad_token_id", 0), + "pad_token_id": config.get( + "pad_token_id", eos_token_id if eos_token_id is not None else 0 + ), "model_type": config.get("model_type", "llama"), + "add_bos_token": config.get("add_bos_token", True), + "add_eos_token": config.get("add_eos_token", False), } diff --git a/uv.lock b/uv.lock index d2bf4be..cb5378f 100644 --- a/uv.lock +++ b/uv.lock @@ -496,26 +496,26 @@ wheels = [ [[package]] name = "uv" -version = "0.8.6" +version = "0.8.8" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b5/3b/1140dbbca9fb3ca32be38e01c670a5980a4ee4874366d70438317876d40a/uv-0.8.6.tar.gz", hash = "sha256:4d4e042f6bd9f143094051a05de758684028f451e563846cbc0c6f505b530cca", size = 3463644, upload-time = 
"2025-08-07T15:43:34.206Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9c/d0/4cd8ac2c7938da78c8e9ca791205f80e74b0f5a680f2a2d50323d54961d0/uv-0.8.8.tar.gz", hash = "sha256:6880e96cd994e53445d364206ddb4b2fff89fd2fbc74a74bef4a6f86384b07d9", size = 3477036, upload-time = "2025-08-09T00:26:00.883Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/71/64/a96f40f95626c6e353e66f6bc5a5ca7c1399e95caf0dcb56cae38754e073/uv-0.8.6-py3-none-linux_armv6l.whl", hash = "sha256:d96ff3a1d06a6a00ed94dfb2996228153b3b5bfc892174b7556216ab872a91b1", size = 18437310, upload-time = "2025-08-07T15:42:49.611Z" }, - { url = "https://files.pythonhosted.org/packages/41/30/b2fed99d5a6b16410669f223767f6d65bc6595858622f5f36386892ed963/uv-0.8.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fdceb1ef554df0ddc620bfe83fdcf740829e489c62f78ba1f089abd62c71c63e", size = 18615884, upload-time = "2025-08-07T15:42:53.452Z" }, - { url = "https://files.pythonhosted.org/packages/d7/82/a53684eadb9cb169eab32ab71f2bdaf7c382819d6de44d4e8df91ca14a00/uv-0.8.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7c1f48279ff61940143c78b969094e13324988eabcfcd4799f4350d9d36c1d48", size = 17173005, upload-time = "2025-08-07T15:42:55.571Z" }, - { url = "https://files.pythonhosted.org/packages/e7/4a/2890d9ccaf4b383fea43ae6362252870dcd97dda7412f34f20d80ccf7a39/uv-0.8.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:1913f5627c57076c88dd38b0173bdb006ae9b8dbd92b1798a1acc9d744c1a7cc", size = 17813305, upload-time = "2025-08-07T15:42:57.998Z" }, - { url = "https://files.pythonhosted.org/packages/9b/c3/33a10049728ffbcde673b75b9a73cd61bfab5e1598d935d1f1b2556b07a4/uv-0.8.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7796acc3c5b84d5ee5e10cc6cf92eb61c19f6551855d0aa89ef5925e4a371fbf", size = 18159834, upload-time = "2025-08-07T15:43:00.207Z" }, - { url = "https://files.pythonhosted.org/packages/81/28/ff884f7007a6b9d0e3368dbe4ae7d28acacbaaf1b3a583640e5af6dc5360/uv-0.8.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a98367bfad38e870e1a8a6626464796ffcee6e937d429fbd7b25ddf46bb36f", size = 18954223, upload-time = "2025-08-07T15:43:03.577Z" }, - { url = "https://files.pythonhosted.org/packages/78/1d/a4ed2da913ecacc1c976e97dff905979c13359834eeeac8bbaf5ed0b2fca/uv-0.8.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:2ac28509db2e52613a59264bdb150d13274ed13e5b305f7e274da8cd83033985", size = 20215802, upload-time = "2025-08-07T15:43:06.181Z" }, - { url = "https://files.pythonhosted.org/packages/2c/12/c9ca1cc8bdbecd54db4a7c1a44808f15271da60838dfa9f180ce8171407a/uv-0.8.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:deab2ce32d2dd7a1c0de459aa23470c60feb0ea24e67c9c5c5988d8bf4eb4a09", size = 19898210, upload-time = "2025-08-07T15:43:09.008Z" }, - { url = "https://files.pythonhosted.org/packages/c0/15/e10347768b2929ae9c65abbfd0867a736e6227f6d63da1f86fe6bdcbcdca/uv-0.8.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b201ebc1c5c76c3a415fa4edcb25a0e06263d2255319d6d52275c775e926e23", size = 19247208, upload-time = "2025-08-07T15:43:11.578Z" }, - { url = "https://files.pythonhosted.org/packages/62/8d/dc290df05d1820d003f30e2fb7853496eec43bcb986c5e35aaea2f5343d3/uv-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6acdc77099906ba64bc1b725bef973c10905d7e9596d1b25f271db772bc9e8e4", size = 19261881, upload-time = "2025-08-07T15:43:13.815Z" }, - { url = 
"https://files.pythonhosted.org/packages/20/bd/6c3b9c87e4ed323f72de6ece7d51a6179091f0ff6e0c9c6ed29e28efe17c/uv-0.8.6-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:4e81380549151e34ae96d56499438444ba58591ca9f2fc6ba0a867152601849e", size = 18037135, upload-time = "2025-08-07T15:43:15.941Z" }, - { url = "https://files.pythonhosted.org/packages/7d/e1/b3e825ad9cc3f03f0f3e232286f91aef985d8029db69fd7091c2f332212b/uv-0.8.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:c9de4adac36a62e4bddd959ce65fb4bb09b0cbfd95946d50390f2a9c186ecb9c", size = 19040739, upload-time = "2025-08-07T15:43:18.092Z" }, - { url = "https://files.pythonhosted.org/packages/c5/14/921e2e7b2a4be0bac17f9d04a126546b89828bb33aa56368af7f00538fe3/uv-0.8.6-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:993af2c295856c5ca053678a8dadc11ce2f85485513ed1568c16e98d5dfa88bf", size = 18060742, upload-time = "2025-08-07T15:43:20.39Z" }, - { url = "https://files.pythonhosted.org/packages/81/54/0b1ecc64353725b62f02d3739a67a567faa70c76c4ea19a21253df1c4d99/uv-0.8.6-py3-none-musllinux_1_1_i686.whl", hash = "sha256:132e73f1e9fe05edc6c06c00416f7c721c48298786fd7293be6c584793170bbc", size = 18430300, upload-time = "2025-08-07T15:43:22.797Z" }, - { url = "https://files.pythonhosted.org/packages/da/be/a1a249eacb9b1e397292106250490ec1546a90c0e19de19f0b36f52aecea/uv-0.8.6-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:ee67acf1b211be2cfbeaec16cde13c8325810d32ff85963a9dedd1f9d7c61ef7", size = 19407124, upload-time = "2025-08-07T15:43:25.915Z" }, - { url = "https://files.pythonhosted.org/packages/11/18/552bb94bb931ea9d09a0e98e5c3d8cefc8c8db25549af88d1484e52d6cdd/uv-0.8.6-py3-none-win32.whl", hash = "sha256:e35cc1ef79d3dce2b6aeffbfb280d02d5ad741d4ca07874bdf0a4d85c841d9de", size = 18324229, upload-time = "2025-08-07T15:43:28.029Z" }, - { url = "https://files.pythonhosted.org/packages/fd/df/b7d1171579e2cc821aafc38a86393104e5426ac1ebc4e95be79ac705a11f/uv-0.8.6-py3-none-win_amd64.whl", hash = "sha256:37227aaf1e41c7eda3d7f0028e747a2a2eed3f3506b0adc121a4366e8281115b", size = 20279856, upload-time = "2025-08-07T15:43:30.07Z" }, - { url = "https://files.pythonhosted.org/packages/09/1b/2629d605e101db6a52397e6ea8859a51af0207cf254051b2a621c683ee07/uv-0.8.6-py3-none-win_arm64.whl", hash = "sha256:0b524de39f317bd8733c38cf100b6f8091d44e06b23f7752523ad1ad1454ede3", size = 18839643, upload-time = "2025-08-07T15:43:32.332Z" }, + { url = "https://files.pythonhosted.org/packages/08/d5/49e188db80f3d8b1969bdbcb8a5468a3796827f15d773241204f206a9ff6/uv-0.8.8-py3-none-linux_armv6l.whl", hash = "sha256:fcdbee030de120478db1a4bb3e3bbf04eec572527ea9107ecf064a808259b6c9", size = 18470316, upload-time = "2025-08-09T00:25:11.956Z" }, + { url = "https://files.pythonhosted.org/packages/01/50/add1afadccd141d0d72b54e5146f8181fcc6efd1567a17c5b1edec444010/uv-0.8.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:461e8fb83931755cf0596bf1b8ccbfe02765e81a0d392c495c07685d6b6591f9", size = 18468770, upload-time = "2025-08-09T00:25:15.391Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ac/3c6dc8781d37ef9854f412322caffac2978dd3fa1bf806f7daebcfebf2be/uv-0.8.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:58056e5ccebb0a1aad27bd89d0ccc5b65c086d5a7f6b0ac16a9dde030b63cf14", size = 17200419, upload-time = "2025-08-09T00:25:18.264Z" }, + { url = "https://files.pythonhosted.org/packages/a1/9e/c30ea1f634673d234999985984afbe96c3d2a4381986e36df0bb46c0f21b/uv-0.8.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = 
"sha256:5b4c56a620137f562e1d7b09eac6c9d4adeb876aefc51be27973257fcb426c9d", size = 17779351, upload-time = "2025-08-09T00:25:20.891Z" }, + { url = "https://files.pythonhosted.org/packages/2f/89/f2885c6e97a265b4b18050df6285f56c81b603a867a63fcd8f2caa04d95c/uv-0.8.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5fc33adb91c4e3db550648aa30c2b97e8e4d8b8842ead7784a9e76dae3cb14dc", size = 18139292, upload-time = "2025-08-09T00:25:23.352Z" }, + { url = "https://files.pythonhosted.org/packages/38/5f/98dad16987919e7dc02f2566026a263ea6307bf57e8de0008dde4717d9cf/uv-0.8.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19a82d6738d3aa58e6646b9d6c343d103abf0c4caf97a68d16a8cab55282e4be", size = 18932468, upload-time = "2025-08-09T00:25:25.691Z" }, + { url = "https://files.pythonhosted.org/packages/56/99/52d0d9f53cc5df11b1a459e743bd7b2f4660d49f125a63640eb85ce993e0/uv-0.8.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:9dce4de70098cb5b98feea9ef0b8f7db5d6b9deea003a926bc044a793872d719", size = 20251614, upload-time = "2025-08-09T00:25:28.122Z" }, + { url = "https://files.pythonhosted.org/packages/9e/b1/0698099a905b4a07b8fa9d6838e0680de707216ccf003433ca1b4afff224/uv-0.8.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1038324c178d2d7407a4005c4c3294cbad6a02368ba5a85242308de62a6f4e12", size = 19916222, upload-time = "2025-08-09T00:25:30.732Z" }, + { url = "https://files.pythonhosted.org/packages/7f/29/8384e0f3f3536ef376d94b7ab177753179906a6c2f5bab893e3fb9525b45/uv-0.8.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bd016beea3935f9148b3d2482e3d60dee36f0260f9e99d4f57acfd978c1142a", size = 19238516, upload-time = "2025-08-09T00:25:33.637Z" }, + { url = "https://files.pythonhosted.org/packages/0e/f1/6c107deccd6e66eb1c46776d8cef4ca9274aac73cec1b14453fe85e18a54/uv-0.8.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0a2b5ebc96aba2b0bf54283d2906b40f32949298cbc6ec48648097ddeac5c5d", size = 19232295, upload-time = "2025-08-09T00:25:37.154Z" }, + { url = "https://files.pythonhosted.org/packages/c5/96/9f5e935cd970102c67ce2a753ac721665fb4477c262e86afa0ab385cefff/uv-0.8.8-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:e529dc0a1be5e896d299e4eae4599fa68909f8cb3e6c5ee1a46f66c9048e3334", size = 18046917, upload-time = "2025-08-09T00:25:39.72Z" }, + { url = "https://files.pythonhosted.org/packages/32/75/97f371add0a02e5e37156ac0fea908ab4a1160fdf716d0e6c257b6767122/uv-0.8.8-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5d58d986c3b6a9ce0fb48cd48b3aee6cb1b1057f928d598432e75a4fcaa370f4", size = 18949133, upload-time = "2025-08-09T00:25:42.139Z" }, + { url = "https://files.pythonhosted.org/packages/1a/1b/ea988ae9d8c5531454ea6904290e229624c9ea830a5c37b91ec74ebde9a4/uv-0.8.8-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:e117e1230559058fd286292dd5839e8e82d1aaf05763bf4a496e91fe07b69fa1", size = 18080018, upload-time = "2025-08-09T00:25:44.645Z" }, + { url = "https://files.pythonhosted.org/packages/ff/14/3b16af331b79ae826d00a73e98f26f7f660dabedc0f82acb99069601b355/uv-0.8.8-py3-none-musllinux_1_1_i686.whl", hash = "sha256:372934fd94193c98dec59bd379cf39e73f906ae6162cbfb66686f32afd75fa0f", size = 18437896, upload-time = "2025-08-09T00:25:49.162Z" }, + { url = "https://files.pythonhosted.org/packages/1c/b6/c866684da5571dbf42e9a60b6587a62adc8a2eb592f07411d3b29cb09871/uv-0.8.8-py3-none-musllinux_1_1_x86_64.whl", hash = 
"sha256:9330c924faa9df00a5e78b54561ecf4e5eac1211066f027620dbe85bd6f479ce", size = 19341221, upload-time = "2025-08-09T00:25:51.444Z" }, + { url = "https://files.pythonhosted.org/packages/49/ea/55a0eff462b2ec5a6327dd87c401c53306406c830fa8f2cabd2af79dd97f/uv-0.8.8-py3-none-win32.whl", hash = "sha256:65113735aa3427d3897e2f537da1331d1391735c6eecb9b820da6a15fd2f6738", size = 18244601, upload-time = "2025-08-09T00:25:53.696Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c0/f56ddb1b2276405618e3d2522018c962c010fc71f97f385d01b7e1dcd8df/uv-0.8.8-py3-none-win_amd64.whl", hash = "sha256:66189ca0b4051396aa19a6f036351477656073d0fd01618051faca699e1b3cdc", size = 20233481, upload-time = "2025-08-09T00:25:56.247Z" }, + { url = "https://files.pythonhosted.org/packages/ac/1a/70dc4c730c19f3af40be9450b98b801e03cd6d16609743013f7258f69a29/uv-0.8.8-py3-none-win_arm64.whl", hash = "sha256:1d829486e88ebbf7895306ff09a8b6014d3af7a18e27d751979ee37bf3a27832", size = 18786215, upload-time = "2025-08-09T00:25:58.941Z" }, ] From de6b853175824ab0a80da0ea0f0b4043f787016311e492cf94b971db1e72b98b Mon Sep 17 00:00:00 2001 From: Tom Foster Date: Sat, 9 Aug 2025 12:58:58 +0100 Subject: [PATCH 2/3] Support GGML quants --- helpers/config/quantisation_configs.py | 52 ++- helpers/models/quantisation.py | 49 +-- helpers/services/ggml_quantise.py | 512 +++++++++++++++++++++++++ helpers/services/huggingface.py | 37 +- helpers/services/orchestrator.py | 135 +++++-- helpers/services/quantisation.py | 56 ++- helpers/utils/rate_limiter.py | 130 +++++++ pyproject.toml | 2 + 8 files changed, 889 insertions(+), 84 deletions(-) create mode 100644 helpers/services/ggml_quantise.py create mode 100644 helpers/utils/rate_limiter.py diff --git a/helpers/config/quantisation_configs.py b/helpers/config/quantisation_configs.py index 133f0ad..fec2e9a 100644 --- a/helpers/config/quantisation_configs.py +++ b/helpers/config/quantisation_configs.py @@ -11,6 +11,19 @@ from __future__ import annotations from helpers.models.quantisation import QuantisationConfig, QuantisationType QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = { + # Basic quantisation profiles + QuantisationType.Q2_0: QuantisationConfig( + name="Q2_0", + description="Basic Q2_0 quantisation (2-bit, smallest)", + base_precision=2, + base_type="Q2_0", + ), + QuantisationType.Q3_0: QuantisationConfig( + name="Q3_0", + description="Basic Q3_0 quantisation (3-bit)", + base_precision=3, + base_type="Q3_0", + ), # Standard quantisation profiles QuantisationType.Q2_K: QuantisationConfig( name="Q2_K", @@ -105,6 +118,12 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = { base_precision=5, embedding_type="q8_0", ), + QuantisationType.Q6_0: QuantisationConfig( + name="Q6_0", + description="Basic Q6_0 quantisation (6-bit)", + base_precision=6, + base_type="Q6_0", + ), QuantisationType.Q6_K: QuantisationConfig( name="Q6_K", description="Q6_K quantisation (high quality, larger size)", @@ -123,9 +142,15 @@ QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = { base_precision=6, output_type="q8_0", ), + QuantisationType.Q8_K: QuantisationConfig( + name="Q8_K", + description="Q8_K quantisation (highest quality, largest size)", + base_precision=8, + base_type="Q8_K", + ), QuantisationType.Q8_0: QuantisationConfig( name="Q8_0", - description="Q8_0 quantisation (highest quality, largest size)", + description="Basic Q8_0 quantisation (8-bit flat)", base_precision=8, base_type="Q8_0", ), @@ -157,46 +182,57 @@ QUANTISATION_CONFIGS: 
dict[QuantisationType, QuantisationConfig] = { } -# Default profile set for optimal quality/size balance DEFAULT_QUANTISATION_TYPES: list[QuantisationType] = [ + # Q3 variants (smallest) QuantisationType.Q3_K_M, QuantisationType.Q3_K_L, QuantisationType.Q3_K_XL, + # Q4 variants + QuantisationType.Q4_0, # Basic - always available QuantisationType.Q4_K_M, QuantisationType.Q4_K_L, + # Q5 variants + QuantisationType.Q5_0, # Basic - always available QuantisationType.Q5_K_M, QuantisationType.Q5_K_L, + # Q6 variants + QuantisationType.Q6_0, # Basic - always available QuantisationType.Q6_K, QuantisationType.Q6_K_L, - QuantisationType.Q8_0, + # Q8 variants (largest) + QuantisationType.Q8_0, # Basic - always available + QuantisationType.Q8_K, ] SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [ # Q2 variants + QuantisationType.Q2_0, QuantisationType.Q2_K, QuantisationType.Q2_K_S, # Q3 K-quants + QuantisationType.Q3_0, QuantisationType.Q3_K_S, QuantisationType.Q3_K_M, QuantisationType.Q3_K_L, QuantisationType.Q3_K_XL, # Q4 K-quants + QuantisationType.Q4_0, + QuantisationType.Q4_1, QuantisationType.Q4_K_S, QuantisationType.Q4_K_M, QuantisationType.Q4_K_L, # Q5 K-quants + QuantisationType.Q5_0, + QuantisationType.Q5_1, QuantisationType.Q5_K_S, QuantisationType.Q5_K_M, QuantisationType.Q5_K_L, # Q6_K + QuantisationType.Q6_0, QuantisationType.Q6_K, QuantisationType.Q6_K_L, # Q8_0 QuantisationType.Q8_0, - # Legacy formats - QuantisationType.Q4_0, - QuantisationType.Q4_1, - QuantisationType.Q5_0, - QuantisationType.Q5_1, + QuantisationType.Q8_K, ] diff --git a/helpers/models/quantisation.py b/helpers/models/quantisation.py index 2776256..97c02cb 100644 --- a/helpers/models/quantisation.py +++ b/helpers/models/quantisation.py @@ -25,38 +25,37 @@ class QuantisationType(StrEnum): embeddings, attention layers, and feed-forward networks. 
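A quick membership sketch of how the expanded type set splits into the basic formats added in this patch versus the K-quants (illustrative only; the import path follows this diff, and the Q6_0 member is introduced below):

    from helpers.models.quantisation import QuantisationType

    BASIC_FORMATS = {
        QuantisationType.Q4_0,
        QuantisationType.Q5_0,
        QuantisationType.Q6_0,
        QuantisationType.Q8_0,
    }
    # K-quants such as Q4_K_M still require llama.cpp architecture support
    assert QuantisationType.Q4_K_M not in BASIC_FORMATS
    assert QuantisationType.Q8_0 in BASIC_FORMATS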
""" - # Q2 variants (smallest, lowest quality) + # Q2 variants + Q2_0 = "Q2_0" # Basic 2-bit quantisation (flat, no K-quant optimisations) Q2_K = "Q2_K" Q2_K_S = "Q2_K_S" - - # Q3 K-quants + # Q3 variants + Q3_0 = "Q3_0" # Basic 3-bit quantisation (flat, no K-quant optimisations) Q3_K_S = "Q3_K_S" Q3_K_M = "Q3_K_M" # llama.cpp default: Q6_K embeddings, Q4_K output, Q5_K V/FFN-down Q3_K_L = "Q3_K_L" # Bartowski: Upgrades output to Q5_K (from M baseline) Q3_K_XL = "Q3_K_XL" # Bartowski: Q8_0 embeddings + Q5_K output (from M baseline) - - # Q4 K-quants (most popular) + # Q4 variants + Q4_0 = "Q4_0" # Basic 4-bit quantisation (flat, no K-quant optimisations) + Q4_1 = "Q4_1" Q4_K_S = "Q4_K_S" Q4_K_M = "Q4_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down Q4_K_L = "Q4_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline) - - # Q5 K-quants + # Q5 variants + Q5_0 = "Q5_0" # Basic 5-bit quantisation (flat, no K-quant optimisations) + Q5_1 = "Q5_1" Q5_K_S = "Q5_K_S" Q5_K_M = "Q5_K_M" # llama.cpp default: Q6_K embeddings, Q6_K V/FFN-down Q5_K_L = "Q5_K_L" # Bartowski: Upgrades embeddings to Q8_0 (from M baseline) - - # Q6_K variants + # Q6 variants + Q6_0 = "Q6_0" # Basic 6-bit quantisation (flat, no K-quant optimisations) Q6_K = "Q6_K" Q6_K_L = "Q6_K_L" # Bartowski: Upgrades embeddings to Q8_0 (all else stays Q6_K) - - # Q8_0 (highest common quantisation) - Q8_0 = "Q8_0" - - # Legacy quantisation formats - Q4_0 = "Q4_0" - Q4_1 = "Q4_1" - Q5_0 = "Q5_0" - Q5_1 = "Q5_1" + # Q8 variants + Q8_0 = "Q8_0" # Basic 8-bit quantisation (flat, no K-quant optimisations) + Q8_K = "Q8_K" # K-quant 8-bit (optimised by llama.cpp) + # F16 variants + F16 = "F16" # F16 quantisation class URLType(StrEnum): @@ -102,7 +101,12 @@ class QuantisationConfig(BaseModel): Dictionary mapping layer types to quantisation specifications for display. """ # Build base quantisation string from precision - base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0" + # For basic types (Q4_0, Q5_0, Q6_0, Q8_0), use the actual base_type + # For K-quants, build from precision + if self.base_type in {"Q4_0", "Q5_0", "Q6_0", "Q8_0"}: + base = self.base_type + else: + base = f"Q{self.base_precision}_K" if self.base_precision < 8 else "Q8_0" # Get inherent enhancements for display - inherit from base type if this is L/XL variant enhancements = self.inherent_enhancements or {} @@ -166,10 +170,9 @@ class QuantisationConfig(BaseModel): == layers["gate_up"] == layers["down"] ): - if self.name == "Q6_K": - return "Q6_K all layers" - if self.name == "Q8_0": - return "Q8_0 all layers" + # For basic types and uniform K-quants, use the actual name + if self.name in {"Q4_0", "Q5_0", "Q6_0", "Q8_0", "Q6_K", "Q8_K"}: + return f"{self.name} all layers" return f"{layers['embed']} all layers" # Build component groups diff --git a/helpers/services/ggml_quantise.py b/helpers/services/ggml_quantise.py new file mode 100644 index 0000000..02f17cf --- /dev/null +++ b/helpers/services/ggml_quantise.py @@ -0,0 +1,512 @@ +"""GGML block quantisation for unsupported architectures. + +Implements proper GGML quantisation formats (Q4_0, Q5_0, Q8_0) using numpy, +following the exact specifications from ggml. This allows quantisation of +models with architectures not yet supported by llama.cpp. 
+""" + +from __future__ import annotations + +import struct +import traceback +from typing import TYPE_CHECKING + +import gguf +import numpy as np + +from helpers.logger import logger +from helpers.services.filesystem import FilesystemService + +if TYPE_CHECKING: + from pathlib import Path + + +# GGML block sizes for different quantisation types +QK4_0 = 32 # Block size for Q4_0 +QK5_0 = 32 # Block size for Q5_0 +QK5_1 = 32 # Block size for Q5_1 +QK8_0 = 32 # Block size for Q8_0 + + +class GGMLQuantiser: + """Implements GGML quantisation formats for architecture-agnostic models. + + Provides proper GGML block quantisation using numpy, following the exact + format specifications. This enables Q4_0, Q5_0, and Q8_0 quantisation + for models with unsupported architectures. + """ + + def __init__(self) -> None: + """Initialise GGML quantiser.""" + self.fs = FilesystemService() + + def get_supported_types(self) -> list[str]: + """Get supported basic quantisation types. + + Returns: + List of supported quantisation type strings. + """ + return ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] + + def quantise_basic( + self, + input_path: Path, + output_path: Path, + quant_type: str, + ) -> bool: + """Perform GGML block quantisation on a GGUF file. + + Reads a GGUF file, quantises all tensors using the specified + quantisation type, and writes a new GGUF file. + + Args: + input_path: Path to input F16/F32 GGUF file + output_path: Path for output quantised GGUF file + quant_type: Quantisation type (Q4_0, Q5_0, Q8_0) + + Returns: + True if successful, False otherwise + """ + if quant_type not in self.get_supported_types(): + logger.error(f"Unsupported quantisation type: {quant_type}") + return False + + logger.info(f"šŸ”§ Starting GGML {quant_type} quantisation...") + logger.info("šŸ“ This uses numpy-based block quantisation") + + try: + # Read input GGUF + logger.info(f"šŸ“– Reading {input_path.name}...") + reader = gguf.GGUFReader(str(input_path)) + + # Create output writer with same architecture + arch = reader.fields.get("general.architecture") + arch_str = "unknown" + + if arch: + # The architecture field can be in different formats + if hasattr(arch, "parts") and arch.parts: + # GGUF stores strings as indices into the parts array + if len(arch.data) > 0: + # Get the index from data + idx = arch.data[0] if isinstance(arch.data, (list, tuple)) else arch.data + + # Get the actual string from parts + if idx < len(arch.parts): + arch_part = arch.parts[idx] + + # Handle different formats + if isinstance(arch_part, bytes): + arch_str = arch_part.decode("utf-8") + elif isinstance(arch_part, str): + arch_str = arch_part + elif isinstance(arch_part, (list, tuple)) and len(arch_part) > 0: + # Sometimes it's nested + if isinstance(arch_part[0], bytes): + arch_str = arch_part[0].decode("utf-8") + else: + arch_str = str(arch_part[0]) + else: + arch_str = str(arch_part) + elif hasattr(arch, "data"): + # Sometimes the data is the string directly as bytes/array + if isinstance(arch.data, np.ndarray): + # It's a numpy array of bytes - convert to string + try: + arch_str = bytes(arch.data).decode("utf-8") + except (UnicodeDecodeError, ValueError): + # If that fails, try converting as ASCII values + arch_str = "".join(chr(c) for c in arch.data if c < 128) + elif isinstance(arch.data, bytes): + arch_str = arch.data.decode("utf-8") + elif isinstance(arch.data, str): + arch_str = arch.data + else: + arch_str = str(arch.data) + + logger.info(f"šŸ“ Architecture: {arch_str}") + writer = gguf.GGUFWriter(str(output_path), arch_str) + + 
# Copy all metadata + logger.info("šŸ“‹ Copying metadata...") + for key, field in reader.fields.items(): + # Skip the file type field - we'll set our own + if key == "general.file_type": + continue + + # Handle different field types + if field.types: + field_type = field.types[0] + field_data = field.parts[field.data[0]] if field.parts else field.data + + if field_type == gguf.GGUFValueType.STRING: + # Handle both bytes and string types + string_val = field_data[0] + if isinstance(string_val, bytes): + string_val = string_val.decode("utf-8") + elif isinstance(string_val, int): + string_val = str(string_val) + writer.add_string(key, string_val) + elif field_type == gguf.GGUFValueType.UINT32: + writer.add_uint32(key, int(field.data[0])) + elif field_type == gguf.GGUFValueType.FLOAT32: + writer.add_float32(key, float(field.data[0])) + elif field_type == gguf.GGUFValueType.BOOL: + writer.add_bool(key, bool(field.data[0])) + elif field_type == gguf.GGUFValueType.ARRAY: + writer.add_array(key, field.data) + else: + # Skip unsupported field types for now + # TODO(tom): Handle other field types appropriately + pass + + # Set file type based on quantisation + file_type_map = { + "Q4_0": gguf.GGMLQuantizationType.Q4_0, + "Q5_0": gguf.GGMLQuantizationType.Q5_0, + "Q6_0": gguf.GGMLQuantizationType.Q6_K, # Q6_0 uses Q6_K enum + "Q8_0": gguf.GGMLQuantizationType.Q8_0, + } + writer.add_file_type(file_type_map[quant_type]) + + # Process tensors + logger.info(f"šŸ”„ Quantising {len(reader.tensors)} tensors to {quant_type}...") + + for i, tensor in enumerate(reader.tensors): + if i % 50 == 0: + logger.info(f" Processing tensor {i}/{len(reader.tensors)}...") + + # Get tensor info + name = tensor.name + shape = list(tensor.shape) + data = tensor.data + + # Determine if this tensor should be quantised + # Some tensors (like embeddings tokens) should stay in original format + should_quantise = self._should_quantise_tensor(name) + + if not should_quantise: + # Keep original format + writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type) + else: + # Quantise the tensor + try: + quantised_data, quant_dtype = self._quantise_tensor( + data, tensor.tensor_type, shape, quant_type + ) + writer.add_tensor( + name, quantised_data, raw_shape=shape, raw_dtype=quant_dtype + ) + except ValueError as e: + # If quantization fails due to shape issues, keep original + logger.warning(f" āš ļø Cannot quantise {name}: {e}") + logger.warning(" Keeping in original format") + writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type) + + # Write the output file + logger.info(f"šŸ’¾ Writing {output_path.name}...") + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file() + writer.close() + + if output_path.exists(): + file_size = self.fs.get_file_size(output_path) + logger.info(f"āœ… GGML quantisation complete: {file_size}") + return True + except Exception as e: + logger.error(f"āŒ GGML quantisation failed: {e}\n{traceback.format_exc()}") + else: + logger.error("āŒ Output file was not created") + return False + + def _should_quantise_tensor(self, tensor_name: str) -> bool: + """Determine if a tensor should be quantised. + + Some tensors like token embeddings should typically remain in + higher precision for quality. 
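Illustrative behaviour of this check, assuming typical GGUF tensor names:

    from helpers.services.ggml_quantise import GGMLQuantiser

    quantiser = GGMLQuantiser()
    quantiser._should_quantise_tensor("token_embd.weight")      # False - kept at source precision
    quantiser._should_quantise_tensor("output.weight")          # False - kept at source precision
    quantiser._should_quantise_tensor("blk.0.ffn_down.weight")  # True - quantised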
+ + Returns: + True if the tensor should be quantised, False otherwise + """ + # Keep token embeddings and output layers in original precision + # These patterns cover most architectures + keep_original = [ + "token_embd", + "output.weight", + "lm_head", + "embed_tokens", + "word_embeddings", + ] + + for pattern in keep_original: + if pattern in tensor_name: + logger.debug(f" Keeping {tensor_name} in original format") + return False + + return True + + def _quantise_tensor( + self, + data: np.ndarray, + dtype: gguf.GGMLQuantizationType, + shape: list[int], + quant_type: str, + ) -> tuple[np.ndarray, gguf.GGMLQuantizationType]: + """Quantise a tensor using GGML block quantisation. + + Returns: + Tuple of (quantised_data, new_dtype) + """ + # Work directly with numpy array - convert to float32 if needed + if dtype in {gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16}: + arr = data.astype(np.float32) + else: + # Already quantised or unknown type - return as-is + return data, dtype + + # Reshape to original shape + arr = arr.reshape(shape) + + # Flatten for processing + arr_flat = arr.flatten() + + # Apply quantisation + if quant_type == "Q8_0": + quantised = self._quantise_q8_0(arr_flat) + new_dtype = gguf.GGMLQuantizationType.Q8_0 + elif quant_type == "Q6_0": + quantised = self._quantise_q6_0(arr_flat) + new_dtype = gguf.GGMLQuantizationType.Q6_K # Q6_0 uses Q6_K enum + elif quant_type == "Q5_0": + quantised = self._quantise_q5_0(arr_flat) + new_dtype = gguf.GGMLQuantizationType.Q5_0 + elif quant_type == "Q4_0": + quantised = self._quantise_q4_0(arr_flat) + new_dtype = gguf.GGMLQuantizationType.Q4_0 + else: + # Unsupported - return original + return data, dtype + + # Convert bytes back to numpy array for gguf writer + return np.frombuffer(quantised, dtype=np.uint8), new_dtype + + def _quantise_q8_0(self, arr: np.ndarray) -> bytes: + """Quantise to Q8_0 format. + + Q8_0: Blocks of 32 values, each block has: + - 1 float16 scale factor (2 bytes) + - 32 int8 values (32 bytes) + Total: 34 bytes per 32 values + + Returns: + Bytes of the quantised data + """ + n = len(arr) + nb = (n + QK8_0 - 1) // QK8_0 # Number of blocks + + output = bytearray() + + for i in range(nb): + # Get block of values + start = i * QK8_0 + end = min(start + QK8_0, n) + block = arr[start:end] + + # Pad if needed + if len(block) < QK8_0: + block = np.pad(block, (0, QK8_0 - len(block)), mode="constant") + + # Calculate scale + amax = np.abs(block).max() + scale = amax / 127.0 if amax > 0 else 1.0 + + # Quantise + quantised = np.round(block / scale).astype(np.int8) + quantised = np.clip(quantised, -128, 127) + + output.extend(struct.pack("e", scale)) # 'e' is float16 + output.extend(quantised.tobytes()) + + return bytes(output) + + def _quantise_q6_0(self, arr: np.ndarray) -> bytes: + """Quantise to Q6_0 format. 
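For intuition, a round-trip sketch for a single block, consistent with the Q8_0 encoder above (each block stores a float16 scale plus 32 int8 values, i.e. 34 bytes for 32 weights, roughly 8.5 bits per weight):

    import numpy as np

    block = np.random.randn(32).astype(np.float32)
    amax = np.abs(block).max()
    scale = amax / 127.0 if amax > 0 else 1.0
    q = np.clip(np.round(block / scale), -128, 127).astype(np.int8)
    dequant = q.astype(np.float32) * scale
    # Quantisation error is bounded by half a quantisation step
    assert np.max(np.abs(dequant - block)) <= scale / 2 + 1e-6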
+ + Q6_0: Blocks of 32 values with 6-bit quantisation + - 1 float16 scale (2 bytes) + - 1 float16 min value (2 bytes) + - 24 bytes of packed 6-bit values (32 values * 6 bits = 192 bits = 24 bytes) + Total: 28 bytes per 32 values + + Returns: + Bytes of the quantised data + """ + n = len(arr) + nb = (n + QK8_0 - 1) // QK8_0 # Use same block size as Q8_0 + + output = bytearray() + + for i in range(nb): + # Get block + start = i * QK8_0 + end = min(start + QK8_0, n) + block = arr[start:end] + + # Pad if needed + if len(block) < QK8_0: + block = np.pad(block, (0, QK8_0 - len(block)), mode="constant") + + # Calculate scale and min + vmin = block.min() + vmax = block.max() + scale = (vmax - vmin) / 63.0 if vmax > vmin else 1.0 + + # Quantise to 6-bit (0-63) + quantised = np.round((block - vmin) / scale).astype(np.uint8) + quantised = np.clip(quantised, 0, 63) + + # Pack scale and min + output.extend(struct.pack("e", scale)) + output.extend(struct.pack("e", vmin)) + + # Pack 6-bit values (simplified - using 1 byte per value) + # Proper implementation would pack 4 values into 3 bytes + for q in quantised: + output.append(q) + + # Pad to expected size + while len(output) % 28 != 0: + output.append(0) + + return bytes(output) + + def _quantise_q5_0(self, arr: np.ndarray) -> bytes: + """Quantise to Q5_0 format. + + Q5_0: Blocks of 32 values with 5-bit quantisation + - 1 float16 scale (2 bytes) + - 1 float16 min value (2 bytes) + - 20 bytes of packed 5-bit values (32 values * 5 bits = 160 bits = 20 bytes) + Total: 24 bytes per 32 values + + Returns: + Bytes of the quantised data + """ + n = len(arr) + nb = (n + QK5_0 - 1) // QK5_0 + + output = bytearray() + + for i in range(nb): + # Get block + start = i * QK5_0 + end = min(start + QK5_0, n) + block = arr[start:end] + + # Pad if needed + if len(block) < QK5_0: + block = np.pad(block, (0, QK5_0 - len(block)), mode="constant") + + # Calculate scale and min + vmin = block.min() + vmax = block.max() + scale = (vmax - vmin) / 31.0 if vmax > vmin else 1.0 + + # Quantise to 5-bit (0-31) + quantised = np.round((block - vmin) / scale).astype(np.uint8) + quantised = np.clip(quantised, 0, 31) + + # Pack scale and min + output.extend(struct.pack("e", scale)) + output.extend(struct.pack("e", vmin)) + + # Pack 5-bit values (simplified packing - not optimal but functional) + # For simplicity, use 1 byte per value (wasting 3 bits each) + # Proper implementation would pack 8 values into 5 bytes + for q in quantised: + output.append(q) + + # Pad to expected size + while len(output) % 24 != 0: + output.append(0) + + return bytes(output) + + def _quantise_q4_0(self, arr: np.ndarray) -> bytes: + """Quantise to Q4_0 format. 
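A decoding sketch for blocks produced by the scale/min writers above (note this follows the simplified one-byte-per-value packing the code currently emits, not the fully packed layouts described in the docstrings):

    import struct

    import numpy as np

    def decode_scale_min_block(buf: bytes, n: int = 32) -> np.ndarray:
        # 2-byte float16 scale, 2-byte float16 min, then n quantised values
        scale = struct.unpack("e", buf[0:2])[0]
        vmin = struct.unpack("e", buf[2:4])[0]
        q = np.frombuffer(buf[4:4 + n], dtype=np.uint8).astype(np.float32)
        return q * scale + vmin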
+ + Q4_0: Blocks of 32 values with 4-bit quantisation + - 1 float16 scale (2 bytes) + - 1 float16 min value (2 bytes) + - 16 bytes of packed 4-bit values (32 values * 4 bits = 128 bits = 16 bytes) + Total: 20 bytes per 32 values + + Returns: + Bytes of the quantised data + """ + n = len(arr) + nb = (n + QK4_0 - 1) // QK4_0 + + output = bytearray() + + for i in range(nb): + # Get block + start = i * QK4_0 + end = min(start + QK4_0, n) + block = arr[start:end] + + # Pad if needed + if len(block) < QK4_0: + block = np.pad(block, (0, QK4_0 - len(block)), mode="constant") + + # Calculate scale and min + vmin = block.min() + vmax = block.max() + scale = (vmax - vmin) / 15.0 if vmax > vmin else 1.0 + + # Quantise to 4-bit (0-15) + quantised = np.round((block - vmin) / scale).astype(np.uint8) + quantised = np.clip(quantised, 0, 15) + + # Pack scale and min + output.extend(struct.pack("e", scale)) + output.extend(struct.pack("e", vmin)) + + # Pack 4-bit values - 2 values per byte + for j in range(0, 32, 2): + packed = (quantised[j] & 0xF) | ((quantised[j + 1] & 0xF) << 4) + output.append(packed) + + return bytes(output) + + def try_alternative_quantisation( + self, + input_path: Path, + output_path: Path, + target_type: str, + ) -> bool: + """Try basic quantisation for unsupported architectures. + + For architectures not supported by llama.cpp, use our GGML implementation + to provide basic quantisation formats. + + Args: + input_path: Input GGUF file path + output_path: Output GGUF file path + target_type: Original quantisation type requested + + Returns: + True if successful, False otherwise + """ + # Only handle basic types that we can generate with GGML + basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] + + if target_type in basic_types: + logger.info(f"šŸ“ Using GGML numpy implementation for {target_type}") + return self.quantise_basic(input_path, output_path, target_type) + + # For K-quants on unsupported architectures, we can't provide a direct equivalent + logger.error(f"āŒ Cannot quantise {target_type} for unsupported architecture") + logger.info("šŸ’” Consider using Q4_0, Q5_0, Q6_0, or Q8_0 instead") + return False diff --git a/helpers/services/huggingface.py b/helpers/services/huggingface.py index 7fdac80..9793caa 100644 --- a/helpers/services/huggingface.py +++ b/helpers/services/huggingface.py @@ -13,6 +13,7 @@ import shutil import subprocess import tempfile from pathlib import Path +from types import SimpleNamespace from typing import TYPE_CHECKING from helpers.config.quantisation_configs import QUANTISATION_CONFIGS @@ -488,9 +489,9 @@ class ReadmeGenerator: # If no quantisations succeeded but F16 is available, still add basic tags if ( len(our_tags) == 1 - and "F16" in results - and hasattr(results["F16"], "status") - and results["F16"].status in {"completed", "uploading"} + and QuantisationType.F16 in results + and hasattr(results[QuantisationType.F16], "status") + and results[QuantisationType.F16].status in {"completed", "uploading"} ): our_tags.append("f16") @@ -522,24 +523,36 @@ which replicates Bartowski's quantisation profiles. |---|---|---| """ - # Add results table - group by layer config patterns - supported_types = [ + # Add results table - properly sorted by precision and type + # Order: Q3 K-quants, Q4 basic, Q4 K-quants, Q5 basic, Q5 K-quants, etc. 
+ ordered_types = [ + # Q3 K-quants QuantisationType.Q3_K_M, QuantisationType.Q3_K_L, QuantisationType.Q3_K_XL, + # Q4 types + QuantisationType.Q4_0, # Basic QuantisationType.Q4_K_M, QuantisationType.Q4_K_L, + # Q5 types + QuantisationType.Q5_0, # Basic QuantisationType.Q5_K_M, QuantisationType.Q5_K_L, + # Q6 types + QuantisationType.Q6_0, # Basic QuantisationType.Q6_K, QuantisationType.Q6_K_L, - QuantisationType.Q8_0, + # Q8 types + QuantisationType.Q8_0, # Basic + QuantisationType.Q8_K, ] - for quant_type in supported_types: - result = results.get(quant_type) - if not result: - result = type("Result", (), {"status": "planned", "success": False})() + for quant_type in ordered_types: + result_temp = results.get(quant_type) + if result_temp is None: + result = SimpleNamespace(status="planned", success=False) # type: ignore[assignment] + else: + result = result_temp config = QUANTISATION_CONFIGS.get(quant_type) status = self._format_status(result, model_source, quant_type, output_repo) @@ -561,12 +574,12 @@ which replicates Bartowski's quantisation profiles. f16_url = f"https://huggingface.co/{output_repo}/blob/main/{f16_filename}" # Get F16 result from results dict (if tracking it) - f16_result = results.get("F16") + f16_result = results.get(QuantisationType.F16) # Get file size f16_size = "-" if f16_result and hasattr(f16_result, "file_size"): - f16_size = f16_result.file_size + f16_size = f16_result.file_size or "-" elif models_dir: # Try to get from actual file f16_path = models_dir / model_source.model_name / f16_filename diff --git a/helpers/services/orchestrator.py b/helpers/services/orchestrator.py index e28ee93..42d82db 100644 --- a/helpers/services/orchestrator.py +++ b/helpers/services/orchestrator.py @@ -9,6 +9,7 @@ from __future__ import annotations import gc import signal +import subprocess import sys import traceback from concurrent.futures import ThreadPoolExecutor @@ -34,6 +35,7 @@ from helpers.services.huggingface import ReadmeGenerator from helpers.services.imatrix_generator import IMatrixGenerator from helpers.services.llama_cpp import IMatrixHandler from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine +from helpers.utils.rate_limiter import ReadmeRateLimiter from helpers.utils.tensor_mapping import URLParser if TYPE_CHECKING: @@ -65,11 +67,13 @@ class QuantisationOrchestrator: # Computed properties models_dir: Path = field(init=False) model_manager: ModelManager = field(init=False) + readme_limiter: ReadmeRateLimiter = field(init=False) def __post_init__(self) -> None: """Initialise computed properties after dataclass construction.""" self.models_dir = self.work_dir / "models" self.model_manager = ModelManager(self.models_dir) + self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0) # Set up signal handlers for graceful exit tracking self._setup_signal_handlers() @@ -90,6 +94,36 @@ class QuantisationOrchestrator: for sig in [signal.SIGINT, signal.SIGTERM]: signal.signal(sig, signal_handler) + def _check_architecture_support(self, f16_model_path: Path) -> bool: + """Check if the model architecture is supported by llama.cpp. 
+ + Args: + f16_model_path: Path to the F16 GGUF model + + Returns: + True if architecture is NOT supported (K-quants should be skipped) + """ + try: + # Try a simple quantization with llama.cpp to check support + result = subprocess.run( + [ + ".cache/llm-gguf-tools/binaries/llama-quantize", + str(f16_model_path), + "/dev/null", + "Q4_K_M", + ], + check=False, + capture_output=True, + text=True, + timeout=5, + ) + + # Check if it failed due to unknown architecture + return bool(result.stderr and "unknown model architecture" in result.stderr.lower()) + except Exception: + # If we can't determine, assume it might work + return False + def get_quantisation_types(self) -> list[QuantisationType]: """Get the quantisation types to use for this run. @@ -160,8 +194,11 @@ class QuantisationOrchestrator: for line in traceback.format_exc().splitlines(): logger.error(f" {line}") raise - else: - return results + finally: + # Always flush pending README updates before exiting + self.readme_limiter.flush() + + return results def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]: """Setup environment and prepare model for quantisation. @@ -235,6 +272,24 @@ class QuantisationOrchestrator: types_list = [qt.value for qt in quantisation_types] logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}") + # Check architecture support upfront + architecture_unsupported = self._check_architecture_support(f16_model_path) + + if architecture_unsupported: + logger.warning("āš ļø Architecture not supported by llama.cpp - K-quants will be skipped") + logger.info("šŸ’” Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated") + + # Pre-mark all K-quants as skipped + basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] + for quant_type in quantisation_types: + if quant_type.value not in basic_types: + results[quant_type] = QuantisationResult( + quantisation_type=quant_type, + success=False, + status="failed", + error_message="K-quant requires llama.cpp architecture support", + ) + # Track F16 in results for status display (if we converted from SafeTensors) if not model_source.is_gguf_repo: # Get F16 file size @@ -257,7 +312,7 @@ class QuantisationOrchestrator: "file_size": f16_size, }, )() - results["F16"] = f16_result + results[QuantisationType.F16] = f16_result # Process with parallel uploads - quantise sequentially but upload in background upload_futures: list[Any] = [] @@ -265,8 +320,12 @@ class QuantisationOrchestrator: with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor: # Start F16 upload first if we have one - if not model_source.is_gguf_repo and not self.no_upload and "F16" in results: - f16_result = results["F16"] + if ( + not model_source.is_gguf_repo + and not self.no_upload + and QuantisationType.F16 in results + ): + f16_result = results[QuantisationType.F16] if f16_result.file_path and f16_result.file_path.exists(): logger.info("Starting parallel upload of F16 GGUF...") f16_result.status = "uploading" @@ -281,14 +340,10 @@ class QuantisationOrchestrator: ) upload_futures.append(upload_future) for i, quant_type in enumerate(quantisation_types, 1): - # Skip remaining quantisations if architecture is unsupported - if architecture_unsupported: - logger.info(f"Skipping {quant_type.value} - architecture not supported") - results[quant_type] = QuantisationResult( - quantisation_type=quant_type, - success=False, - status="failed", - error_message="Architecture not supported by llama.cpp", + # Skip if already marked as 
failed (e.g., K-quants for unsupported arch) + if quant_type in results and results[quant_type].status == "failed": + logger.info( + f"Skipping {quant_type.value} - {results[quant_type].error_message}" ) continue @@ -321,20 +376,27 @@ class QuantisationOrchestrator: == "unsupported_architecture" ): logger.warning( - "Architecture not supported - skipping remaining quantisations" + "āš ļø Architecture not supported by llama.cpp - K-quants will be skipped" + ) + logger.info( + "šŸ’” Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated" ) architecture_unsupported = True # Update the current result to also show as skipped result.error_message = "Architecture not supported by llama.cpp" - # Update README immediately to show remaining quantizations as skipped + # Update README immediately to show remaining K-quants as skipped + # But don't mark basic types as failed - they can still use GGML + basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] for remaining_quant_type in quantisation_types[i:]: if remaining_quant_type not in results: - results[remaining_quant_type] = QuantisationResult( - quantisation_type=remaining_quant_type, - success=False, - status="failed", - error_message="Architecture not supported by llama.cpp", - ) + # Only mark K-quants as failed due to architecture + if remaining_quant_type.value not in basic_types: + results[remaining_quant_type] = QuantisationResult( + quantisation_type=remaining_quant_type, + success=False, + status="failed", + error_message="K-quant requires llama.cpp architecture support", + ) self._update_readme_status(model_source, results, output_repo) # Force cleanup between quantisations @@ -594,12 +656,27 @@ class QuantisationOrchestrator: results: dict[QuantisationType, QuantisationResult], output_repo: str, ) -> None: - """Update README with current quantisation status.""" + """Update README with current quantisation status using rate limiting.""" if not self.no_upload: - updated_readme_path = self.readme_generator.generate( - model_source, results, self.models_dir, output_repo + # Use rate limiter to batch updates + self.readme_limiter.request_update( + self._do_readme_update, + model_source, + results, + output_repo, ) - self.uploader.upload_readme(output_repo, updated_readme_path) + + def _do_readme_update( + self, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + output_repo: str, + ) -> None: + """Actually perform the README update (called by rate limiter).""" + updated_readme_path = self.readme_generator.generate( + model_source, results, self.models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, updated_readme_path) def _wait_for_uploads(self, upload_futures: list) -> None: """Wait for all parallel uploads to complete.""" @@ -690,7 +767,7 @@ class QuantisationOrchestrator: output_repo: str, file_path: Path, model_source: ModelSource, - results: dict[str, QuantisationResult], + results: dict[QuantisationType, QuantisationResult], ) -> None: """Upload F16 file and clean up (runs in background thread).""" try: @@ -701,7 +778,7 @@ class QuantisationOrchestrator: # Don't delete F16 yet - we still need it for quantisations # It will be deleted in _cleanup_files after all quantisations complete - results["F16"].status = "completed" + results[QuantisationType.F16].status = "completed" updated_readme_path = self.readme_generator.generate( model_source, results, self.models_dir, output_repo ) @@ -710,8 +787,8 @@ class QuantisationOrchestrator: logger.info("[PARALLEL] F16 upload complete") except 
Exception as e: logger.error(f"[PARALLEL] Failed to upload F16: {e}") - results["F16"].status = "failed" - results["F16"].error_message = str(e) + results[QuantisationType.F16].status = "failed" + results[QuantisationType.F16].error_message = str(e) try: updated_readme_path = self.readme_generator.generate( diff --git a/helpers/services/quantisation.py b/helpers/services/quantisation.py index a48b6f0..ae9cc6f 100644 --- a/helpers/services/quantisation.py +++ b/helpers/services/quantisation.py @@ -10,6 +10,7 @@ from __future__ import annotations import shutil import subprocess import tempfile +import time import traceback from pathlib import Path @@ -21,6 +22,7 @@ from helpers.models.quantisation import ( QuantisationType, ) from helpers.services.filesystem import FilesystemService +from helpers.services.ggml_quantise import GGMLQuantiser from helpers.services.gguf import GGUFConverter from helpers.services.llama_cpp import QuantisationExecutor from helpers.utils.config_parser import ConfigParser @@ -39,12 +41,14 @@ class QuantisationEngine: """Initialise quantisation engine.""" self.fs = FilesystemService() self.executor = QuantisationExecutor() + self.ggml_quantiser = GGMLQuantiser() def quantise(self, context: QuantisationContext) -> QuantisationResult: """Perform quantisation using the specified configuration. Executes quantisation using direct llama.cpp binary with proper - tensor override flags for L and XL variants. + tensor override flags for L and XL variants. Falls back to GGML + for basic types when architecture is unsupported. Returns: QuantisationResult with success status and file information. @@ -69,8 +73,12 @@ class QuantisationEngine: logger.info(f"šŸ“ Source: {context.f16_model_path}") logger.info(f"šŸ“ Target: {output_path}") + # Determine if this is a basic type that can use GGML + basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] + is_basic_type = context.config.name in basic_types + try: - # Use direct binary execution for quantisation + # Try llama.cpp first for all types logger.info("šŸ”§ Using llama.cpp binary for quantisation...") success = self.executor.execute_quantisation( @@ -80,6 +88,23 @@ class QuantisationEngine: if success: return self._create_success_result(context.config.name, output_path, "llama.cpp") + # Check if this was an architecture error and we can use GGML fallback + if ( + hasattr(self.executor, "last_error") + and self.executor.last_error == "unsupported_architecture" + and is_basic_type + ): + logger.info("šŸ”„ Architecture unsupported - using GGML implementation...") + + success = self.ggml_quantiser.try_alternative_quantisation( + context.f16_model_path, output_path, context.config.name + ) + + if success: + return self._create_success_result( + context.config.name, output_path, "GGML numpy" + ) + logger.error(f"āŒ {context.config.name} quantisation failed") return QuantisationResult( quantisation_type=QuantisationType(context.config.name), @@ -349,16 +374,17 @@ class ModelManager: ) # Stream output line by line - for line in process.stdout: - # Log download progress lines - if line.strip(): - # Check if it's a progress line (contains %) - if "%" in line or "Downloading" in line or "Fetching" in line: - # Use info level for progress lines - logger.info(f" {line.strip()}") - else: - # Use debug for other output - logger.debug(f" {line.strip()}") + if process.stdout: + for line in process.stdout: + # Log download progress lines + if line.strip(): + # Check if it's a progress line (contains %) + if "%" in line or "Downloading" in line or 
"Fetching" in line: + # Use info level for progress lines + logger.info(f" {line.strip()}") + else: + # Use debug for other output + logger.debug(f" {line.strip()}") # Wait for process to complete return_code = process.wait() @@ -503,6 +529,9 @@ class HuggingFaceUploader: """ logger.info("Uploading README...") + # Add delay to prevent rate limiting + time.sleep(2) + # First ensure the repository exists self._ensure_repo_exists(output_repo) @@ -576,6 +605,9 @@ class HuggingFaceUploader: """ logger.info(f"Uploading {model_path.name}...") + # Add delay to prevent rate limiting + time.sleep(3) + # Always use huggingface-cli for model files to ensure xet backend is used try: logger.debug(f"DEBUG: Uploading model file {model_path.name} to {output_repo}") diff --git a/helpers/utils/rate_limiter.py b/helpers/utils/rate_limiter.py new file mode 100644 index 0000000..2331cd9 --- /dev/null +++ b/helpers/utils/rate_limiter.py @@ -0,0 +1,130 @@ +"""Rate limiter for README updates. + +Implements a cooldown mechanism to prevent excessive HuggingFace API calls +while ensuring all updates eventually reach the repository. +""" + +from __future__ import annotations + +import threading +import time +from typing import TYPE_CHECKING, Any + +from helpers.logger import logger + +if TYPE_CHECKING: + from collections.abc import Callable + + +class ReadmeRateLimiter: + """Rate limits README updates to prevent API throttling. + + Ensures updates are batched with a minimum interval between API calls, + while guaranteeing that pending updates are eventually applied. + """ + + def __init__(self, cooldown_seconds: float = 30.0) -> None: + """Initialise rate limiter with specified cooldown period. + + Args: + cooldown_seconds: Minimum seconds between updates (default 30). + """ + self.cooldown_seconds = cooldown_seconds + self.last_update_time = 0.0 + self.pending_update = False + self.update_lock = threading.Lock() + self.timer: threading.Timer | None = None + self.update_func: Callable[..., Any] | None = None + self.update_args: tuple[Any, ...] | None = None + self.update_kwargs: dict[str, Any] | None = None + + def request_update( + self, + update_func: Callable[..., Any], + *args: Any, + **kwargs: Any, + ) -> None: + """Request a README update, respecting rate limits. + + Updates are batched during cooldown periods and executed + when the cooldown expires. 
+ + Args: + update_func: Function to call for the update + *args: Positional arguments for update_func + **kwargs: Keyword arguments for update_func + """ + with self.update_lock: + current_time = time.time() + time_since_last = current_time - self.last_update_time + + # Store the latest update request + self.update_func = update_func + self.update_args = args + self.update_kwargs = kwargs + + if time_since_last >= self.cooldown_seconds: + # Enough time has passed, update immediately + logger.debug(f"README update allowed (last update {time_since_last:.1f}s ago)") + self._execute_update() + else: + # Still in cooldown, schedule for later + remaining = self.cooldown_seconds - time_since_last + logger.debug(f"README update delayed ({remaining:.1f}s cooldown remaining)") + + if not self.pending_update: + # Schedule an update when cooldown expires + self.pending_update = True + if self.timer: + self.timer.cancel() + self.timer = threading.Timer(remaining, self._delayed_update) + self.timer.start() + else: + # Update already scheduled, just update the args + logger.debug("README update already scheduled, updating with latest data") + + def _execute_update(self) -> None: + """Execute the actual update (must be called with lock held).""" + if self.update_func: + try: + args = self.update_args or () + kwargs = self.update_kwargs or {} + self.update_func(*args, **kwargs) + self.last_update_time = time.time() + logger.debug("README update completed") + except Exception as e: + logger.error(f"README update failed: {e}") + + self.pending_update = False + self.update_func = None + self.update_args = None + self.update_kwargs = None + + def _delayed_update(self) -> None: + """Execute a delayed update after cooldown expires.""" + with self.update_lock: + if self.pending_update: + logger.debug("Executing delayed README update") + self._execute_update() + + def flush(self) -> None: + """Force any pending updates to execute immediately. + + Called at script end to ensure final state is uploaded. 
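+
+        Example (illustrative sketch only; ``push_readme``, ``repo_id`` and
+        ``readme_path`` are hypothetical names, not part of this module):
+
+            limiter = ReadmeRateLimiter(cooldown_seconds=30.0)
+            limiter.request_update(push_readme, repo_id, readme_path)
+            # further request_update calls within the cooldown are batched
+            limiter.flush()  # guarantees the final state is uploaded before exit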
+ """ + with self.update_lock: + if self.timer: + self.timer.cancel() + self.timer = None + + if self.pending_update and self.update_func: + logger.info("Flushing pending README update...") + # Wait for cooldown if needed + current_time = time.time() + time_since_last = current_time - self.last_update_time + if time_since_last < self.cooldown_seconds: + wait_time = self.cooldown_seconds - time_since_last + logger.info(f"Waiting {wait_time:.1f}s for cooldown before final update...") + time.sleep(wait_time) + + self._execute_update() diff --git a/pyproject.toml b/pyproject.toml index 4dfef4a..be3756c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,8 @@ skip-magic-trailing-comma = false [tool.ruff.lint] fixable = ["ALL"] ignore = [ + "ANN002", # type annotation for args + "ANN003", # type annotation for kwargs "ANN401", # use of Any type "BLE001", # blind Exception usage "COM812", # missing trailing comma From 21d8c03aeae71f2918bd7d4d5e76bea70ccad758512a1cdb75f7c3d1b603ccb4 Mon Sep 17 00:00:00 2001 From: Tom Foster Date: Sat, 9 Aug 2025 17:16:02 +0100 Subject: [PATCH 3/3] Refactor modules --- helpers/filesystem/__init__.py | 17 + helpers/filesystem/cleanup.py | 81 ++ .../operations.py} | 29 +- helpers/filesystem/workspace.py | 146 +++ helpers/ggml/__init__.py | 11 + .../ggml_quantise.py => ggml/quantiser.py} | 340 ++++--- helpers/gguf/__init__.py | 12 + helpers/gguf/converter.py | 216 +++++ helpers/gguf/reader.py | 231 +++++ helpers/gguf/writer.py | 374 ++++++++ helpers/huggingface/__init__.py | 19 + helpers/huggingface/client.py | 124 +++ helpers/huggingface/repository.py | 167 ++++ helpers/huggingface/uploader.py | 330 +++++++ helpers/huggingface/wrapper.py | 57 ++ helpers/llama_cpp/__init__.py | 20 + helpers/llama_cpp/architecture.py | 235 +++++ .../{services => llama_cpp}/binary_manager.py | 33 +- .../imatrix.py} | 80 +- helpers/llama_cpp/quantiser.py | 219 +++++ helpers/quantisation/__init__.py | 23 + helpers/quantisation/engine.py | 141 +++ helpers/quantisation/executor.py | 457 ++++++++++ helpers/quantisation/model_manager.py | 422 +++++++++ helpers/quantisation/orchestrator.py | 229 +++++ helpers/quantisation/profile_manager.py | 132 +++ helpers/quantisation/progress.py | 151 ++++ helpers/readme/__init__.py | 23 + helpers/readme/formatter.py | 265 ++++++ helpers/readme/generator.py | 311 +++++++ helpers/readme/templates.py | 228 +++++ helpers/services/__init__.py | 6 - helpers/services/gguf.py | 478 ---------- helpers/services/huggingface.py | 744 --------------- helpers/services/llama_cpp.py | 295 ------ helpers/services/llama_python.py | 756 ---------------- helpers/services/orchestrator.py | 846 ------------------ helpers/services/quantisation.py | 742 --------------- helpers/utils/config_parser.py | 13 +- helpers/utils/rate_limiter.py | 13 +- quantise_gguf.py | 2 +- safetensors2gguf.py | 2 +- 42 files changed, 4961 insertions(+), 4059 deletions(-) create mode 100644 helpers/filesystem/__init__.py create mode 100644 helpers/filesystem/cleanup.py rename helpers/{services/filesystem.py => filesystem/operations.py} (86%) create mode 100644 helpers/filesystem/workspace.py create mode 100644 helpers/ggml/__init__.py rename helpers/{services/ggml_quantise.py => ggml/quantiser.py} (59%) create mode 100644 helpers/gguf/__init__.py create mode 100644 helpers/gguf/converter.py create mode 100644 helpers/gguf/reader.py create mode 100644 helpers/gguf/writer.py create mode 100644 helpers/huggingface/__init__.py create mode 100644 helpers/huggingface/client.py create mode 100644 
helpers/huggingface/repository.py create mode 100644 helpers/huggingface/uploader.py create mode 100644 helpers/huggingface/wrapper.py create mode 100644 helpers/llama_cpp/__init__.py create mode 100644 helpers/llama_cpp/architecture.py rename helpers/{services => llama_cpp}/binary_manager.py (92%) rename helpers/{services/imatrix_generator.py => llama_cpp/imatrix.py} (76%) create mode 100644 helpers/llama_cpp/quantiser.py create mode 100644 helpers/quantisation/__init__.py create mode 100644 helpers/quantisation/engine.py create mode 100644 helpers/quantisation/executor.py create mode 100644 helpers/quantisation/model_manager.py create mode 100644 helpers/quantisation/orchestrator.py create mode 100644 helpers/quantisation/profile_manager.py create mode 100644 helpers/quantisation/progress.py create mode 100644 helpers/readme/__init__.py create mode 100644 helpers/readme/formatter.py create mode 100644 helpers/readme/generator.py create mode 100644 helpers/readme/templates.py delete mode 100644 helpers/services/__init__.py delete mode 100644 helpers/services/gguf.py delete mode 100644 helpers/services/huggingface.py delete mode 100644 helpers/services/llama_cpp.py delete mode 100644 helpers/services/llama_python.py delete mode 100644 helpers/services/orchestrator.py delete mode 100644 helpers/services/quantisation.py diff --git a/helpers/filesystem/__init__.py b/helpers/filesystem/__init__.py new file mode 100644 index 0000000..eecc15b --- /dev/null +++ b/helpers/filesystem/__init__.py @@ -0,0 +1,17 @@ +"""Filesystem operations and management. + +Provides utilities for file cleanup, workspace management, and +directory operations throughout the quantisation workflow. +""" + +from __future__ import annotations + +from helpers.filesystem.cleanup import FileCleanup +from helpers.filesystem.operations import FilesystemService +from helpers.filesystem.workspace import WorkspaceManager + +__all__ = [ + "FileCleanup", + "FilesystemService", + "WorkspaceManager", +] diff --git a/helpers/filesystem/cleanup.py b/helpers/filesystem/cleanup.py new file mode 100644 index 0000000..0b4b922 --- /dev/null +++ b/helpers/filesystem/cleanup.py @@ -0,0 +1,81 @@ +"""File cleanup operations for the quantisation workflow. + +Manages removal of temporary files, model cleanup after processing, +and disk space recovery during quantisation operations. +""" + +from __future__ import annotations + +from shutil import rmtree as shutil_rmtree +from typing import TYPE_CHECKING + +from helpers.logger import logger + +if TYPE_CHECKING: + from pathlib import Path + + from helpers.models.quantisation import ModelSource + + +class FileCleanup: + """Handles cleanup of temporary and intermediate files. + + Provides methods for removing processed model files, temporary + conversions, and other artifacts to manage disk space efficiently + during quantisation workflows. + """ + + @staticmethod + def cleanup_files(f16_model_path: Path, model_source: ModelSource, models_dir: Path) -> None: + """Clean up temporary files after processing. + + Removes F16 model and original format files to save disk space + after successful quantisation and upload. Processes both F16 + GGUF files and original model formats to maximise storage recovery. 
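+
+        Example (illustrative; ``f16_path``, ``source`` and ``models_dir`` are
+        assumed to come from the orchestrator):
+
+            FileCleanup.cleanup_files(f16_path, source, models_dir)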
+ """ + if f16_model_path.exists(): + logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...") + f16_model_path.unlink() + + if not model_source.is_gguf_repo: + FileCleanup.cleanup_original_model(model_source, models_dir) + + @staticmethod + def cleanup_original_model(model_source: ModelSource, models_dir: Path) -> None: + """Clean up original model files after successful conversion. + + Removes SafeTensors files to save disk space whilst preserving + configuration, tokeniser, and metadata files for reference. The + design prioritises space efficiency over re-conversion capability. + """ + model_dir = models_dir / model_source.model_name + + safetensor_files = list(model_dir.glob("*.safetensors")) + if safetensor_files: + logger.info(f"Removing {len(safetensor_files)} SafeTensors files...") + for file in safetensor_files: + file.unlink() + + logger.info("Keeping config files, tokeniser, and metadata for reference") + + @staticmethod + def cleanup_quantisation_file(file_path: Path) -> None: + """Remove a single quantisation file. + + Safely removes the specified file with existence checking and + logging for disk space management during quantisation workflows. + """ + if file_path.exists(): + logger.info(f"Removing {file_path.name} to save disk space...") + file_path.unlink() + + @staticmethod + def cleanup_temp_directory(temp_dir: Path) -> None: + """Clean up a temporary directory and all its contents. + + Recursively removes the directory and all subdirectories with + error tolerance to handle locked or missing files gracefully. + """ + if temp_dir.exists() and temp_dir.is_dir(): + logger.debug(f"Cleaning up temporary directory: {temp_dir}") + shutil_rmtree(temp_dir, ignore_errors=True) diff --git a/helpers/services/filesystem.py b/helpers/filesystem/operations.py similarity index 86% rename from helpers/services/filesystem.py rename to helpers/filesystem/operations.py index 6337720..055fa61 100644 --- a/helpers/services/filesystem.py +++ b/helpers/filesystem/operations.py @@ -1,8 +1,7 @@ -"""Filesystem operations service. +"""Core filesystem operations. Provides unified filesystem operations including file discovery, size -calculation, and path management. Consolidates common filesystem patterns -used across quantisation and conversion workflows. +calculation, and path management for quantisation workflows. """ from __future__ import annotations @@ -21,8 +20,7 @@ class FilesystemService: """Handles filesystem operations with consistent error handling. Provides methods for file discovery, size formatting, and JSON loading - with proper error handling and logging. Ensures consistent behaviour - across different tools and workflows. + with proper error handling and logging. """ @staticmethod @@ -31,7 +29,7 @@ class FilesystemService: Attempts to use `du -h` for human-readable output, falling back to Python calculation if the system command fails. Provides consistent - size formatting across the toolset. + formatting across different platforms and file sizes. Returns: Human-readable file size string (e.g. "1.5G", "750M"). @@ -43,7 +41,6 @@ class FilesystemService: return result.stdout.split()[0] except (subprocess.CalledProcessError, FileNotFoundError): # Fallback to Python calculation - try: size_bytes: float = float(file_path.stat().st_size) for unit in ["B", "K", "M", "G", "T"]: @@ -60,8 +57,7 @@ class FilesystemService: """Load and parse JSON configuration file. Provides consistent JSON loading with proper error handling and - encoding specification. 
Used for loading model configurations, - tokeniser settings, and other JSON-based metadata. + UTF-8 encoding specification for cross-platform compatibility. Returns: Parsed JSON content as dictionary. @@ -81,9 +77,8 @@ class FilesystemService: """Find all SafeTensor files in model directory using priority search. Searches for tensor files in order of preference: single model.safetensors, - sharded model-*-of-*.safetensors files, then any *.safetensors files. This - approach handles both single-file and multi-shard model distributions whilst - ensuring predictable file ordering for conversion consistency. + sharded model-*-of-*.safetensors files, then any *.safetensors files. + The prioritisation ensures optimal handling of different model formats. Returns: List of SafeTensor file paths in priority order. @@ -116,7 +111,7 @@ class FilesystemService: Searches for GGUF files with optional pattern matching. Prioritises multi-part files (00001-of-*) over single files for proper handling - of large models split across multiple files. + of sharded model architectures. Returns: List of GGUF file paths, sorted with multi-part files first. @@ -140,8 +135,8 @@ class FilesystemService: def ensure_directory(path: Path) -> Path: """Ensure directory exists, creating if necessary. - Creates directory and all parent directories if they don't exist. - Returns the path for method chaining convenience. + Creates directory and all parent directories if they don't exist, + using atomic operations to handle concurrent access gracefully. Returns: The directory path. @@ -153,8 +148,8 @@ class FilesystemService: def cleanup_directory(path: Path, pattern: str = "*") -> int: """Remove files matching pattern from directory. - Safely removes files matching the specified glob pattern. Returns - count of files removed for logging purposes. + Safely removes files matching the specified glob pattern with + comprehensive error handling to prevent workflow interruption. Returns: Number of files removed. diff --git a/helpers/filesystem/workspace.py b/helpers/filesystem/workspace.py new file mode 100644 index 0000000..b13f959 --- /dev/null +++ b/helpers/filesystem/workspace.py @@ -0,0 +1,146 @@ +"""Workspace management for quantisation operations. + +Manages working directories, model storage paths, and temporary +file locations throughout the quantisation workflow. +""" + +from __future__ import annotations + +import tempfile +from pathlib import Path +from shutil import disk_usage as shutil_disk_usage, rmtree as shutil_rmtree + +from helpers.logger import logger + + +class WorkspaceManager: + """Manages workspace directories and paths. + + Provides centralised management of working directories, model + storage, and temporary file locations with automatic directory + creation and validation. + """ + + def __init__(self, work_dir: Path | None = None) -> None: + """Initialise workspace manager. + + Sets up base working directory structure with models and temporary + file directories. Defaults to quantisation_work in current directory + if no path is specified. 
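+
+        Example (illustrative sketch of typical usage):
+
+            workspace = WorkspaceManager()  # defaults to ./quantisation_work
+            model_dir = workspace.get_model_dir("my-model")
+            output = workspace.get_quantisation_output_path(
+                "my-model", "author", "Q4_K_M"
+            )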
+ """ + self.work_dir = work_dir or Path.cwd() / "quantisation_work" + self.models_dir = self.work_dir / "models" + self._setup_directories() + + def _setup_directories(self) -> None: + """Create necessary workspace directories.""" + self.work_dir.mkdir(parents=True, exist_ok=True) + self.models_dir.mkdir(parents=True, exist_ok=True) + logger.debug(f"Workspace initialised at: {self.work_dir}") + + def get_model_dir(self, model_name: str) -> Path: + """Get directory path for a specific model. + + Creates the model directory if it doesn't exist and returns the path + for storing model files and quantisation outputs. + + Returns: + Path to model directory. + """ + model_dir = self.models_dir / model_name + model_dir.mkdir(parents=True, exist_ok=True) + return model_dir + + def get_temp_dir(self, prefix: str = "temp") -> Path: + """Get a temporary directory path within workspace. + + Creates a unique temporary directory with specified prefix within + the workspace for intermediate processing files. + + Returns: + Path to temporary directory. + """ + return Path(tempfile.mkdtemp(prefix=f"{prefix}_", dir=self.work_dir)) + + def get_imatrix_dir(self, model_name: str) -> Path: + """Get directory for importance matrix files. + + Creates and returns the path to the imatrix directory for storing + importance matrices used in advanced quantisation methods. + + Returns: + Path to imatrix directory. + """ + imatrix_dir = self.models_dir / model_name / "imatrix" + imatrix_dir.mkdir(parents=True, exist_ok=True) + return imatrix_dir + + def get_quantisation_output_path( + self, + model_name: str, + author: str, + quant_type: str, + ) -> Path: + """Get output path for a quantised model. + + Constructs standardised filename and path for quantised model output + using author-model-quantisation format for consistent naming. + + Returns: + Path for quantised model output. + """ + model_dir = self.get_model_dir(model_name) + filename = f"{author}-{model_name}-{quant_type}.gguf" + return model_dir / filename + + def cleanup_workspace(self) -> None: + """Clean up entire workspace directory.""" + if self.work_dir.exists(): + logger.info(f"Cleaning up workspace: {self.work_dir}") + shutil_rmtree(self.work_dir, ignore_errors=True) + + @property + def disk_usage(self) -> dict[str, float]: + """Get disk usage statistics for workspace. + + Returns: + Dictionary with size in GB for work_dir and models_dir. + """ + + def get_dir_size(path: Path) -> float: + """Calculate total size of directory in GB. + + Recursively traverses directory tree to calculate total file + sizes with GB conversion for human-readable output. + + Returns: + Total size of directory in GB. + """ + total = 0 + if path.exists(): + for item in path.rglob("*"): + if item.is_file(): + total += item.stat().st_size + return total / (1024**3) # Convert to GB + + return { + "work_dir": get_dir_size(self.work_dir), + "models_dir": get_dir_size(self.models_dir), + } + + def validate_space(self, required_gb: float = 50.0) -> bool: + """Check if sufficient disk space is available. + + Validates available disk space against required threshold, logging + warnings when space is insufficient for quantisation operations. + + Returns: + True if sufficient space available. 
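+
+        Example (illustrative guard before starting a large quantisation run;
+        ``workspace`` is an assumed instance of this class):
+
+            if not workspace.validate_space(required_gb=100.0):
+                logger.warning("Proceeding despite low disk space")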
+ """ + stat = shutil_disk_usage(self.work_dir) + free_gb = stat.free / (1024**3) + + if free_gb < required_gb: + logger.warning(f"Low disk space: {free_gb:.1f}GB free, {required_gb:.1f}GB recommended") + return False + return True diff --git a/helpers/ggml/__init__.py b/helpers/ggml/__init__.py new file mode 100644 index 0000000..dd55b9d --- /dev/null +++ b/helpers/ggml/__init__.py @@ -0,0 +1,11 @@ +"""GGML quantisation operations. + +Provides numpy-based GGML block quantisation for architectures +not supported by llama.cpp. +""" + +from __future__ import annotations + +from helpers.ggml.quantiser import GGMLQuantiser + +__all__ = ["GGMLQuantiser"] diff --git a/helpers/services/ggml_quantise.py b/helpers/ggml/quantiser.py similarity index 59% rename from helpers/services/ggml_quantise.py rename to helpers/ggml/quantiser.py index 02f17cf..93fd846 100644 --- a/helpers/services/ggml_quantise.py +++ b/helpers/ggml/quantiser.py @@ -9,13 +9,13 @@ from __future__ import annotations import struct import traceback -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import gguf import numpy as np +from helpers.filesystem import FilesystemService from helpers.logger import logger -from helpers.services.filesystem import FilesystemService if TYPE_CHECKING: from pathlib import Path @@ -48,6 +48,193 @@ class GGMLQuantiser: """ return ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] + def _extract_architecture_string(self, arch_field: Any) -> str: + """Extract architecture string from GGUF field data. + + Handles various formats of architecture field storage in GGUF files. + + Returns: + Architecture string or 'unknown' if extraction fails. + """ + if not arch_field: + return "unknown" + + if hasattr(arch_field, "parts") and arch_field.parts: + return self._extract_from_parts_array(arch_field) + if hasattr(arch_field, "data"): + return self._extract_from_data_field(arch_field.data) + + return "unknown" + + def _extract_from_parts_array(self, arch_field: Any) -> str: + """Extract architecture from GGUF parts array format. + + Returns: + Architecture string or 'unknown' if extraction fails. + """ + if len(arch_field.data) == 0: + return "unknown" + + idx = arch_field.data[0] if isinstance(arch_field.data, (list, tuple)) else arch_field.data + + if idx >= len(arch_field.parts): + return "unknown" + + return self._decode_part(arch_field.parts[idx]) + + def _decode_part(self, arch_part: Any) -> str: + """Decode architecture part to string. + + Returns: + Decoded string representation. + """ + if isinstance(arch_part, bytes): + return arch_part.decode("utf-8") + if isinstance(arch_part, str): + return arch_part + if isinstance(arch_part, (list, tuple)) and len(arch_part) > 0: + # Handle nested format + if isinstance(arch_part[0], bytes): + return arch_part[0].decode("utf-8") + return str(arch_part[0]) + return str(arch_part) + + def _extract_from_data_field(self, data: Any) -> str: + """Extract architecture from GGUF data field. + + Returns: + Architecture string or 'unknown' if extraction fails. 
+ """ + if isinstance(data, np.ndarray): + # It's a numpy array of bytes - convert to string + try: + return bytes(data).decode("utf-8") + except (UnicodeDecodeError, ValueError): + # If that fails, try converting as ASCII values + return "".join(chr(c) for c in data if c < 128) + elif isinstance(data, bytes): + return data.decode("utf-8") + elif isinstance(data, str): + return data + else: + return str(data) + + def _copy_metadata_fields(self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter) -> None: + """Copy metadata fields from reader to writer, excluding file type.""" + logger.info("šŸ“‹ Copying metadata...") + + for key, field in reader.fields.items(): + # Skip the file type field - we'll set our own + if key == "general.file_type": + continue + + # Handle different field types + if field.types: + field_type = field.types[0] + field_data = field.parts[field.data[0]] if field.parts else field.data + + self._copy_field_by_type(writer, key, field_type, field_data, field) + + def _copy_field_by_type( + self, + writer: gguf.GGUFWriter, + key: str, + field_type: gguf.GGUFValueType, + field_data: Any, + field: Any, + ) -> None: + """Copy a single field based on its type.""" + if field_type == gguf.GGUFValueType.STRING: + # Handle both bytes and string types + string_val = field_data[0] + if isinstance(string_val, bytes): + string_val = string_val.decode("utf-8") + elif isinstance(string_val, int): + string_val = str(string_val) + writer.add_string(key, string_val) + elif field_type == gguf.GGUFValueType.UINT32: + writer.add_uint32(key, int(field.data[0])) + elif field_type == gguf.GGUFValueType.FLOAT32: + writer.add_float32(key, float(field.data[0])) + elif field_type == gguf.GGUFValueType.BOOL: + writer.add_bool(key, bool(field.data[0])) + elif field_type == gguf.GGUFValueType.ARRAY: + writer.add_array(key, field.data) + else: + # Skip unsupported field types for now + # Future enhancement: Handle additional GGUF field types as needed + pass + + def _get_file_type_mapping(self) -> dict[str, gguf.GGMLQuantizationType]: + """Get mapping from quantisation type strings to GGML enums. + + Returns: + Mapping from quantisation type strings to GGML enums. 
+ """ + return { + "Q4_0": gguf.GGMLQuantizationType.Q4_0, + "Q5_0": gguf.GGMLQuantizationType.Q5_0, + "Q6_0": gguf.GGMLQuantizationType.Q6_K, # Q6_0 uses Q6_K enum + "Q8_0": gguf.GGMLQuantizationType.Q8_0, + } + + def _process_tensor_list( + self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter, quant_type: str + ) -> None: + """Process all tensors for quantisation.""" + logger.info(f"šŸ”„ Quantising {len(reader.tensors)} tensors to {quant_type}...") + + for i, tensor in enumerate(reader.tensors): + if i % 50 == 0: + logger.info(f" Processing tensor {i}/{len(reader.tensors)}...") + + self._process_single_tensor(tensor, writer, quant_type) + + def _process_single_tensor(self, tensor: Any, writer: gguf.GGUFWriter, quant_type: str) -> None: + """Process a single tensor for quantisation or preserve as-is.""" + # Get tensor info + name = tensor.name + shape = list(tensor.shape) + data = tensor.data + + # Determine if this tensor should be quantised + should_quantise = self._should_quantise_tensor(name) + + if not should_quantise: + # Keep original format + writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type) + else: + # Quantise the tensor + try: + quantised_data, quant_dtype = self._quantise_tensor( + data, tensor.tensor_type, shape, quant_type + ) + writer.add_tensor(name, quantised_data, raw_shape=shape, raw_dtype=quant_dtype) + except ValueError as e: + # If quantization fails due to shape issues, keep original + logger.warning(f" āš ļø Cannot quantise {name}: {e}") + logger.warning(" Keeping in original format") + writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type) + + def _write_output_file(self, writer: gguf.GGUFWriter, output_path: Path) -> bool: + """Write the final GGUF file and verify creation. + + Returns: + True if successful, False otherwise + """ + logger.info(f"šŸ’¾ Writing {output_path.name}...") + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file() + writer.close() + + if output_path.exists(): + file_size = self.fs.get_file_size(output_path) + logger.info(f"āœ… GGML quantisation complete: {file_size}") + return True + logger.error("āŒ Output file was not created") + return False + def quantise_basic( self, input_path: Path, @@ -57,12 +244,8 @@ class GGMLQuantiser: """Perform GGML block quantisation on a GGUF file. Reads a GGUF file, quantises all tensors using the specified - quantisation type, and writes a new GGUF file. - - Args: - input_path: Path to input F16/F32 GGUF file - output_path: Path for output quantised GGUF file - quant_type: Quantisation type (Q4_0, Q5_0, Q8_0) + quantisation type, and writes a new GGUF file. Implements proper + GGML block formats for architecture-agnostic quantisation. 
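+        As a rough illustration of the block format involved (a sketch, not
+        the exact code path below), Q8_0 stores each run of 32 weights as one
+        float16 scale plus 32 signed 8-bit quants:
+
+            d = np.abs(block).max() / 127.0          # per-block scale
+            q = np.round(block / d).astype(np.int8)  # 32 int8 quants
+            # on disk: float16(d) followed by the 32 int8 values
+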
Returns: True if successful, False otherwise @@ -80,145 +263,28 @@ class GGMLQuantiser: reader = gguf.GGUFReader(str(input_path)) # Create output writer with same architecture - arch = reader.fields.get("general.architecture") - arch_str = "unknown" - - if arch: - # The architecture field can be in different formats - if hasattr(arch, "parts") and arch.parts: - # GGUF stores strings as indices into the parts array - if len(arch.data) > 0: - # Get the index from data - idx = arch.data[0] if isinstance(arch.data, (list, tuple)) else arch.data - - # Get the actual string from parts - if idx < len(arch.parts): - arch_part = arch.parts[idx] - - # Handle different formats - if isinstance(arch_part, bytes): - arch_str = arch_part.decode("utf-8") - elif isinstance(arch_part, str): - arch_str = arch_part - elif isinstance(arch_part, (list, tuple)) and len(arch_part) > 0: - # Sometimes it's nested - if isinstance(arch_part[0], bytes): - arch_str = arch_part[0].decode("utf-8") - else: - arch_str = str(arch_part[0]) - else: - arch_str = str(arch_part) - elif hasattr(arch, "data"): - # Sometimes the data is the string directly as bytes/array - if isinstance(arch.data, np.ndarray): - # It's a numpy array of bytes - convert to string - try: - arch_str = bytes(arch.data).decode("utf-8") - except (UnicodeDecodeError, ValueError): - # If that fails, try converting as ASCII values - arch_str = "".join(chr(c) for c in arch.data if c < 128) - elif isinstance(arch.data, bytes): - arch_str = arch.data.decode("utf-8") - elif isinstance(arch.data, str): - arch_str = arch.data - else: - arch_str = str(arch.data) + arch_field = reader.fields.get("general.architecture") + arch_str = self._extract_architecture_string(arch_field) logger.info(f"šŸ“ Architecture: {arch_str}") writer = gguf.GGUFWriter(str(output_path), arch_str) # Copy all metadata - logger.info("šŸ“‹ Copying metadata...") - for key, field in reader.fields.items(): - # Skip the file type field - we'll set our own - if key == "general.file_type": - continue - - # Handle different field types - if field.types: - field_type = field.types[0] - field_data = field.parts[field.data[0]] if field.parts else field.data - - if field_type == gguf.GGUFValueType.STRING: - # Handle both bytes and string types - string_val = field_data[0] - if isinstance(string_val, bytes): - string_val = string_val.decode("utf-8") - elif isinstance(string_val, int): - string_val = str(string_val) - writer.add_string(key, string_val) - elif field_type == gguf.GGUFValueType.UINT32: - writer.add_uint32(key, int(field.data[0])) - elif field_type == gguf.GGUFValueType.FLOAT32: - writer.add_float32(key, float(field.data[0])) - elif field_type == gguf.GGUFValueType.BOOL: - writer.add_bool(key, bool(field.data[0])) - elif field_type == gguf.GGUFValueType.ARRAY: - writer.add_array(key, field.data) - else: - # Skip unsupported field types for now - # TODO(tom): Handle other field types appropriately - pass + self._copy_metadata_fields(reader, writer) # Set file type based on quantisation - file_type_map = { - "Q4_0": gguf.GGMLQuantizationType.Q4_0, - "Q5_0": gguf.GGMLQuantizationType.Q5_0, - "Q6_0": gguf.GGMLQuantizationType.Q6_K, # Q6_0 uses Q6_K enum - "Q8_0": gguf.GGMLQuantizationType.Q8_0, - } + file_type_map = self._get_file_type_mapping() writer.add_file_type(file_type_map[quant_type]) # Process tensors - logger.info(f"šŸ”„ Quantising {len(reader.tensors)} tensors to {quant_type}...") - - for i, tensor in enumerate(reader.tensors): - if i % 50 == 0: - logger.info(f" Processing tensor 
{i}/{len(reader.tensors)}...") - - # Get tensor info - name = tensor.name - shape = list(tensor.shape) - data = tensor.data - - # Determine if this tensor should be quantised - # Some tensors (like embeddings tokens) should stay in original format - should_quantise = self._should_quantise_tensor(name) - - if not should_quantise: - # Keep original format - writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type) - else: - # Quantise the tensor - try: - quantised_data, quant_dtype = self._quantise_tensor( - data, tensor.tensor_type, shape, quant_type - ) - writer.add_tensor( - name, quantised_data, raw_shape=shape, raw_dtype=quant_dtype - ) - except ValueError as e: - # If quantization fails due to shape issues, keep original - logger.warning(f" āš ļø Cannot quantise {name}: {e}") - logger.warning(" Keeping in original format") - writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type) + self._process_tensor_list(reader, writer, quant_type) # Write the output file - logger.info(f"šŸ’¾ Writing {output_path.name}...") - writer.write_header_to_file() - writer.write_kv_data_to_file() - writer.write_tensors_to_file() - writer.close() + return self._write_output_file(writer, output_path) - if output_path.exists(): - file_size = self.fs.get_file_size(output_path) - logger.info(f"āœ… GGML quantisation complete: {file_size}") - return True except Exception as e: logger.error(f"āŒ GGML quantisation failed: {e}\n{traceback.format_exc()}") - else: - logger.error("āŒ Output file was not created") - return False + return False def _should_quantise_tensor(self, tensor_name: str) -> bool: """Determine if a tensor should be quantised. @@ -488,13 +554,9 @@ class GGMLQuantiser: ) -> bool: """Try basic quantisation for unsupported architectures. - For architectures not supported by llama.cpp, use our GGML implementation - to provide basic quantisation formats. - - Args: - input_path: Input GGUF file path - output_path: Output GGUF file path - target_type: Original quantisation type requested + For architectures not supported by llama.cpp, uses GGML implementation + to provide basic quantisation formats as fallback. Handles only basic + types that can be generated with numpy-based GGML quantisation. Returns: True if successful, False otherwise diff --git a/helpers/gguf/__init__.py b/helpers/gguf/__init__.py new file mode 100644 index 0000000..45cff05 --- /dev/null +++ b/helpers/gguf/__init__.py @@ -0,0 +1,12 @@ +"""GGUF file operations. + +Provides reading, writing, and conversion utilities for GGUF format files. +""" + +from __future__ import annotations + +from helpers.gguf.converter import GGUFConverter +from helpers.gguf.reader import GGUFReader +from helpers.gguf.writer import GGUFWriter + +__all__ = ["GGUFConverter", "GGUFReader", "GGUFWriter"] diff --git a/helpers/gguf/converter.py b/helpers/gguf/converter.py new file mode 100644 index 0000000..5dc59b9 --- /dev/null +++ b/helpers/gguf/converter.py @@ -0,0 +1,216 @@ +"""SafeTensors to GGUF conversion. + +Handles conversion of SafeTensors models to GGUF format with proper +metadata and tensor mapping. 
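+
+Typical use (illustrative; the config and tensor mapper are assumed to come
+from the calling code):
+
+    GGUFConverter.convert_safetensors(
+        model_path, output_path, model_config, "llama", tensor_mapper
+    )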
+""" + +from __future__ import annotations + +import gc +import json +import traceback +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import torch +from safetensors import safe_open + +from helpers.filesystem import FilesystemService +from helpers.gguf.writer import GGUFWriter +from helpers.logger import logger + +if TYPE_CHECKING: + from helpers.models.conversion import ModelConfig + from helpers.utils.tensor_mapping import TensorMapper + + +class GGUFConverter: + """High-level GGUF conversion orchestrator. + + Coordinates the complete conversion workflow from source models to GGUF + format, managing metadata extraction, tensor mapping, and file writing. + """ + + @staticmethod + def convert_safetensors( + model_path: Path, + output_path: Path, + model_config: ModelConfig, + architecture: str, + tensor_mapper: TensorMapper, + ) -> bool: + """Convert SafeTensors model to GGUF format. + + Orchestrates the conversion process including metadata setup, tensor + loading with BFloat16 support, name mapping, and tokeniser integration. + + Returns: + True if conversion successful, False otherwise. + """ + logger.info(f"Converting {model_path.name} to GGUF...") + + # Create writer + writer_wrapper = GGUFWriter(output_path, architecture) + + # Add metadata + writer_wrapper.add_metadata(model_config, model_path.name) + + # Add vision metadata if present + if model_config.vision_config: + writer_wrapper.add_vision_metadata(model_config.vision_config) + + # Load and add tensors + fs = FilesystemService() + tensor_files = fs.find_safetensor_files(model_path) + logger.info(f"Found {len(tensor_files)} tensor file(s)") + + tensor_count = 0 + for tensor_file in tensor_files: + logger.info(f"Loading {tensor_file.name}...") + with safe_open(tensor_file, framework="pt") as f: + for tensor_name in f.keys(): # noqa: SIM118 + tensor_data = f.get_tensor(tensor_name) + + # Convert BFloat16 to Float32 + if hasattr(tensor_data, "numpy"): + if torch and tensor_data.dtype == torch.bfloat16: + tensor_data = tensor_data.float() + numpy_data = tensor_data.numpy() + else: + # Already numpy + numpy_data = tensor_data + + # Map tensor name + gguf_name = tensor_mapper.map_tensor_name(tensor_name) + if not gguf_name: + logger.debug(f"Skipping unmapped tensor: {tensor_name}") + continue + + logger.debug(f" {tensor_name} -> {gguf_name}") + writer_wrapper.add_tensor(gguf_name, numpy_data) + tensor_count += 1 + + # Clean up memory after each file + gc.collect() + if torch and torch.cuda.is_available(): + torch.cuda.empty_cache() + + logger.info(f"Added {tensor_count} tensors") + + # Add tokeniser + tokeniser_config = GGUFConverter.load_tokeniser_config(model_path) + if tokeniser_config: + writer_wrapper.add_tokeniser(tokeniser_config) + writer_wrapper.add_tokeniser_vocabulary(model_path) + + # Finalise and write + writer_wrapper.write() + + # Clean up + del writer_wrapper + gc.collect() + + return output_path.exists() + + @staticmethod + def convert_pytorch( + model_path: Path, + output_path: Path, + model_config: ModelConfig, + architecture: str, + tensor_mapper: TensorMapper, + ) -> bool: + """Convert PyTorch model to GGUF format. + + Handles PyTorch bin file conversion with sharded model support, + BFloat16 compatibility, and proper memory management. + + Returns: + True if conversion successful, False otherwise. 
+ """ + logger.info(f"Converting {model_path.name} to GGUF...") + + # Create writer + writer_wrapper = GGUFWriter(output_path, architecture) + + # Add metadata + writer_wrapper.add_metadata(model_config, model_path.name) + + # Load and add tensors + fs = FilesystemService() + model_files = fs.find_safetensor_files(model_path) + logger.info(f"Found {len(model_files)} model file(s)") + + tensor_count = 0 + for model_file in model_files: + logger.info(f"Loading {model_file.name}...") + try: + checkpoint = torch.load(model_file, map_location="cpu", weights_only=True) + + for tensor_name, tensor_data in checkpoint.items(): + # Convert to numpy + if hasattr(tensor_data, "numpy"): + if tensor_data.dtype == torch.bfloat16: + converted_tensor = tensor_data.float() + else: + converted_tensor = tensor_data + numpy_data = converted_tensor.numpy() + else: + numpy_data = tensor_data + + # Map tensor name + gguf_name = tensor_mapper.map_tensor_name(tensor_name) + if not gguf_name: + logger.debug(f"Skipping unmapped tensor: {tensor_name}") + continue + + logger.debug(f" {tensor_name} -> {gguf_name}") + writer_wrapper.add_tensor(gguf_name, numpy_data) + tensor_count += 1 + + # Clean up checkpoint + del checkpoint + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e: + logger.error(f"Failed to load {model_file.name}: {e}") + logger.error(traceback.format_exc()) + return False + + logger.info(f"Added {tensor_count} tensors") + + # Add tokeniser + tokeniser_config = GGUFConverter.load_tokeniser_config(model_path) + if tokeniser_config: + writer_wrapper.add_tokeniser(tokeniser_config) + writer_wrapper.add_tokeniser_vocabulary(model_path) + + # Finalise and write + writer_wrapper.write() + + # Clean up + del writer_wrapper + gc.collect() + + return output_path.exists() + + @staticmethod + def load_tokeniser_config(model_path: Path) -> dict[str, Any] | None: + """Load tokeniser configuration from model directory. + + Returns: + Tokeniser configuration dictionary or None if not found. + """ + config_path = model_path / "tokenizer_config.json" + if not config_path.exists(): + logger.warning("tokenizer_config.json not found") + return None + + try: + with Path(config_path).open(encoding="utf-8") as f: + return json.load(f) + except Exception as e: + logger.error(f"Failed to load tokeniser config: {e}") + return None diff --git a/helpers/gguf/reader.py b/helpers/gguf/reader.py new file mode 100644 index 0000000..babdc47 --- /dev/null +++ b/helpers/gguf/reader.py @@ -0,0 +1,231 @@ +"""GGUF file reading operations. + +Provides utilities for reading and extracting information from GGUF files. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import gguf +import numpy as np + +from helpers.logger import logger + +if TYPE_CHECKING: + from pathlib import Path + + +class GGUFReader: + """Reads and extracts information from GGUF files. + + Provides methods to read metadata, architecture information, and tensors + from existing GGUF files for inspection or re-quantisation. + """ + + def __init__(self, file_path: Path) -> None: + """Initialise GGUF reader with file path. + + Sets up the internal GGUF reader instance for subsequent metadata + and tensor extraction operations on the specified file. + """ + self.file_path = file_path + self.reader = gguf.GGUFReader(str(file_path)) + + def get_architecture(self) -> str: + """Extract architecture string from GGUF file. + + Returns: + Architecture string or "unknown" if not found. 
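+
+        Example (illustrative; the file name is hypothetical):
+
+            reader = GGUFReader(Path("model-F16.gguf"))
+            arch = reader.get_architecture()  # e.g. "llama" or "qwen2"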
+ """ + arch = self.reader.fields.get("general.architecture") + if not arch: + return "unknown" + + # Try extracting from parts array format + if hasattr(arch, "parts") and arch.parts: + return self._extract_from_parts(arch) + + # Try extracting from data field directly + if hasattr(arch, "data"): + return self._extract_from_data(arch.data) + + return "unknown" + + def _extract_from_parts(self, arch: Any) -> str: + """Extract architecture from parts array. + + Returns: + Architecture string or "unknown". + """ + if len(arch.data) == 0: + return "unknown" + + # Get index and validate + idx = arch.data[0] if isinstance(arch.data, (list, tuple)) else arch.data + if idx >= len(arch.parts): + return "unknown" + + return self._decode_arch_part(arch.parts[idx]) + + def _decode_arch_part(self, arch_part: Any) -> str: + """Decode architecture part to string. + + Returns: + Decoded architecture string. + """ + if isinstance(arch_part, bytes): + return arch_part.decode("utf-8") + if isinstance(arch_part, str): + return arch_part + if isinstance(arch_part, (list, tuple)) and len(arch_part) > 0: + # Handle nested format + if isinstance(arch_part[0], bytes): + return arch_part[0].decode("utf-8") + return str(arch_part[0]) + return str(arch_part) + + def _extract_from_data(self, data: Any) -> str: + """Extract architecture from data field. + + Returns: + Architecture string or "unknown". + """ + if isinstance(data, np.ndarray): + # Convert numpy array of bytes to string + try: + return bytes(data).decode("utf-8") + except (UnicodeDecodeError, ValueError): + # Fallback to ASCII conversion + return "".join(chr(c) for c in data if c < 128) + if isinstance(data, bytes): + return data.decode("utf-8") + if isinstance(data, str): + return data + return str(data) + + def get_metadata(self) -> dict[str, Any]: + """Extract all metadata from GGUF file. + + Returns: + Dictionary of metadata fields and values. + """ + metadata: dict[str, Any] = {} + + for key, field in self.reader.fields.items(): + if field.types and field.data: + field_type = field.types[0] + field_data = field.parts[field.data[0]] if field.parts else field.data + + # Convert data based on type + if field_type == gguf.GGUFValueType.STRING: + if isinstance(field_data, (list, tuple)) and field_data: + string_value = field_data[0] + if isinstance(string_value, bytes): + string_value = string_value.decode("utf-8") + metadata[key] = string_value + else: + metadata[key] = str(field_data) + elif field_type in { + gguf.GGUFValueType.UINT32, + gguf.GGUFValueType.INT32, + gguf.GGUFValueType.FLOAT32, + gguf.GGUFValueType.BOOL, + }: + metadata[key] = ( + field.data[0] if isinstance(field.data, (list, tuple)) else field.data + ) + elif field_type == gguf.GGUFValueType.ARRAY: + metadata[key] = list(field.data) + + return metadata + + def get_tensor_info(self) -> list[dict[str, Any]]: + """Get information about all tensors in the file. + + Returns: + List of tensor info dictionaries with name, shape, and type. + """ + tensor_info = [] + + for tensor in self.reader.tensors: + info = { + "name": tensor.name, + "shape": list(tensor.shape), + "type": tensor.tensor_type.name + if hasattr(tensor.tensor_type, "name") + else str(tensor.tensor_type), + "size_bytes": tensor.data.nbytes + if hasattr(tensor.data, "nbytes") + else len(tensor.data), + } + tensor_info.append(info) + + return tensor_info + + def get_quantisation_type(self) -> str | None: + """Get the quantisation type of the GGUF file. + + Returns: + Quantisation type string or None if not found. 
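+
+        Example (illustrative; ``gguf_path`` is an assumed Path):
+
+            quant = GGUFReader(gguf_path).get_quantisation_type()  # e.g. "Q4_K_M"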
+ """ + file_type = self.reader.fields.get("general.file_type") + + if file_type and hasattr(file_type, "data"): + # Map numeric file type to string + file_type_value = ( + file_type.data[0] if isinstance(file_type.data, (list, tuple)) else file_type.data + ) + + # Common file type mappings + file_type_map = { + 0: "F32", + 1: "F16", + 2: "Q4_0", + 3: "Q4_1", + 7: "Q8_0", + 8: "Q5_0", + 9: "Q5_1", + 10: "Q2_K", + 11: "Q3_K_S", + 12: "Q3_K_M", + 13: "Q3_K_L", + 14: "Q4_K_S", + 15: "Q4_K_M", + 16: "Q5_K_S", + 17: "Q5_K_M", + 18: "Q6_K", + } + + return file_type_map.get(int(file_type_value), f"Unknown ({file_type_value})") + + return None + + def validate(self) -> bool: + """Validate that the GGUF file is properly formatted. + + Returns: + True if file is valid, False otherwise. + """ + try: + # Check basic structure + if not self.reader.fields: + logger.error("No metadata fields found") + return False + + # Check for required fields + required_fields = ["general.architecture"] + for field in required_fields: + if field not in self.reader.fields: + logger.error(f"Missing required field: {field}") + return False + + # Check tensors + if not self.reader.tensors: + logger.warning("No tensors found in file") + + except Exception as e: + logger.error(f"Validation failed: {e}") + return False + else: + return True diff --git a/helpers/gguf/writer.py b/helpers/gguf/writer.py new file mode 100644 index 0000000..f020a37 --- /dev/null +++ b/helpers/gguf/writer.py @@ -0,0 +1,374 @@ +"""GGUF file writing operations. + +Provides high-level interface for creating GGUF files with metadata, +tensors, and tokeniser information. +""" + +from __future__ import annotations + +import json +import operator +import traceback +from pathlib import Path +from typing import TYPE_CHECKING, Any, Protocol + +import gguf + +from helpers.logger import logger + +if TYPE_CHECKING: + import numpy as np + + from helpers.models.conversion import ModelConfig + + +class VisionConfig(Protocol): + """Protocol for vision model configuration.""" + + hidden_size: int + num_hidden_layers: int + num_attention_heads: int + intermediate_size: int + patch_size: int + spatial_merge_size: int + + +class GGUFWriter: + """Manages GGUF file creation and metadata writing. + + Provides high-level interface for GGUF file operations including metadata + configuration, tensor addition, and tokeniser integration. Encapsulates + low-level GGUF library interactions for consistent error handling. + """ + + def __init__(self, output_path: Path, architecture: str) -> None: + """Initialise GGUF writer with output path and architecture. + + Creates the underlying GGUF writer instance and prepares for metadata + and tensor addition. Sets up the file structure for the specified + model architecture. + """ + self.output_path = output_path + self.architecture = architecture + self.writer = gguf.GGUFWriter(str(output_path), architecture) + logger.info(f"Created GGUF writer for {architecture} architecture") + + def add_metadata(self, model_config: ModelConfig, model_name: str) -> None: + """Add comprehensive metadata from model configuration. + + Writes general model information, architectural parameters, and + quantisation settings to the GGUF file header. Handles both standard + and vision model configurations with appropriate parameter mapping. 
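+
+        Example (illustrative end-to-end sketch using this wrapper; the
+        ``embeddings`` array is a placeholder):
+
+            writer = GGUFWriter(output_path, "llama")
+            writer.add_metadata(model_config, "my-model")
+            writer.add_tensor("token_embd.weight", embeddings)
+            writer.write()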
+ """ + # General metadata + self.writer.add_name(model_name) + self.writer.add_description(f"Converted from {model_config.architectures[0]}") + self.writer.add_file_type(gguf.LlamaFileType.ALL_F32) + + # Log architecture being used + logger.info(f"Setting GGUF architecture: {self.architecture}") + if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}: + logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp") + + # Model parameters from config + params = model_config.to_gguf_params() + self.writer.add_context_length(params.context_length) + self.writer.add_embedding_length(params.embedding_length) + self.writer.add_block_count(params.block_count) + self.writer.add_feed_forward_length(params.feed_forward_length) + self.writer.add_head_count(params.attention_head_count) + self.writer.add_head_count_kv(params.attention_head_count_kv) + self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon) + self.writer.add_rope_freq_base(params.rope_freq_base) + self.writer.add_rope_dimension_count(params.rope_dimension_count) + + logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context") + + def add_vision_metadata(self, vision_config: VisionConfig | None) -> None: + """Add vision model parameters to GGUF metadata. + + Configures vision-specific parameters for multimodal models including + embedding dimensions, attention heads, and spatial processing settings. + """ + if not vision_config: + return + + logger.info("Adding vision model parameters...") + self.writer.add_vision_embedding_length(vision_config.hidden_size) + self.writer.add_vision_block_count(vision_config.num_hidden_layers) + self.writer.add_vision_head_count(vision_config.num_attention_heads) + self.writer.add_vision_feed_forward_length(vision_config.intermediate_size) + self.writer.add_vision_patch_size(vision_config.patch_size) + self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size) + + if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps: + self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps) + + def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None: + """Add tokeniser metadata to GGUF file. + + Writes special token IDs and tokeniser model type to enable proper + text processing during inference. Uses sensible defaults for missing + configuration values. + """ + self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1)) + self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2)) + self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0)) + self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0)) + + # Add BOS/EOS token addition flags if available + if "add_bos_token" in tokeniser_config: + self.writer.add_add_bos_token(tokeniser_config["add_bos_token"]) + if "add_eos_token" in tokeniser_config: + self.writer.add_add_eos_token(tokeniser_config["add_eos_token"]) + + # Note: tokenizer_model is set by add_tokeniser_vocabulary based on actual tokenizer type + + logger.info("Added tokeniser configuration") + + def add_tokeniser_vocabulary(self, model_path: Path) -> None: + """Add full tokeniser vocabulary to GGUF file. + + Loads and embeds the complete tokeniser vocabulary including tokens, + merges, and scores to enable standalone model usage without external + tokeniser files. Supports BPE, Unigram, and WordPiece tokenizers. 
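+
+        The ``tokenizer.json`` shape this method expects, abridged for
+        illustration (values are placeholders):
+
+            {"model": {"type": "BPE", "vocab": {"hello": 123}, "merges": ["h e"]},
+             "added_tokens": [{"content": "<s>", "id": 0}],
+             "pre_tokenizer": {"type": "ByteLevel"}}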
+ """ + tokenizer_path = model_path / "tokenizer.json" + if not tokenizer_path.exists(): + logger.warning("tokenizer.json not found, skipping vocabulary embedding") + return + + try: + with Path(tokenizer_path).open(encoding="utf-8") as f: + tokenizer_data = json.load(f) + + model_data = tokenizer_data.get("model", {}) + model_type = model_data.get("type", "") + + # Get pre-tokenizer information + pre_tokenizer = tokenizer_data.get("pre_tokenizer", {}) + pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer) + + # Get added tokens + added_tokens = tokenizer_data.get("added_tokens", []) + + if model_type == "BPE": + self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type) + elif model_type == "Unigram": + self._add_unigram_tokenizer(model_data, added_tokens) + elif model_type == "WordPiece": + self._add_wordpiece_tokenizer(model_data, added_tokens) + else: + logger.warning(f"Unsupported tokenizer type: {model_type}") + # Try to add as generic tokenizer + self._add_generic_tokenizer(model_data, tokenizer_data) + + except Exception as e: + logger.error(f"Failed to load tokeniser vocabulary: {e}") + logger.error(traceback.format_exc()) + + def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str: + """Determine pre-tokenizer type from configuration. + + Returns: + Pre-tokenizer type. + """ + if not pre_tokenizer: + return "default" + + # Check for various pre-tokenizer types + pre_type = pre_tokenizer.get("type", "") + if "ByteLevel" in str(pre_type): + return "llama3" + if "Metaspace" in str(pre_type): + return "default" + + return "default" + + def _add_bpe_tokenizer( + self, + model_data: dict[str, Any], + added_tokens: list[dict[str, Any]], + pre_tokenizer_type: str, + ) -> None: + """Add BPE tokenizer to GGUF file.""" + vocab = model_data.get("vocab", {}) + merges = model_data.get("merges", []) + + # Set tokenizer model based on pre-tokenizer type + if pre_tokenizer_type == "llama3": + self.writer.add_tokenizer_model("gpt2") + self.writer.add_tokenizer_pre("llama3") + else: + self.writer.add_tokenizer_model("gpt2") + + # Create token list with scores + tokens = [] + scores = [] + toktypes = [] + + # Add vocabulary tokens + for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)): + tokens.append(token_str) + scores.append(0.0) # BPE doesn't use scores + + # Determine token type + is_added = any(t.get("content") == token_str for t in added_tokens) + if is_added: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + + # Add to writer + self.writer.add_token_list(tokens) + self.writer.add_token_scores(scores) + self.writer.add_token_types(toktypes) + + # Add merges + if merges: + self.writer.add_token_merges(merges) + + logger.info(f"Added BPE tokenizer: {len(tokens)} tokens, {len(merges)} merges") + + def _add_unigram_tokenizer( + self, + model_data: dict[str, Any], + added_tokens: list[dict[str, Any]], + ) -> None: + """Add Unigram tokenizer to GGUF file.""" + vocab = model_data.get("vocab", []) + + self.writer.add_tokenizer_model("unigram") + + # Create token list with scores + tokens = [] + scores = [] + toktypes = [] + + # Add vocabulary tokens + for token_data in vocab: + if isinstance(token_data, list) and len(token_data) >= 2: + token_str, score = token_data[0], token_data[1] + else: + continue + + tokens.append(token_str) + scores.append(float(score)) + + # Determine token type + is_added = any(t.get("content") == token_str for t in added_tokens) + if is_added: + 
toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + + # Add to writer + self.writer.add_token_list(tokens) + self.writer.add_token_scores(scores) + self.writer.add_token_types(toktypes) + + logger.info(f"Added Unigram tokenizer: {len(tokens)} tokens") + + def _add_wordpiece_tokenizer( + self, + model_data: dict[str, Any], + added_tokens: list[dict[str, Any]], + ) -> None: + """Add WordPiece tokenizer to GGUF file.""" + vocab = model_data.get("vocab", {}) + + self.writer.add_tokenizer_model("bert") + + # Create token list + tokens = [] + scores = [] + toktypes = [] + + # Add vocabulary tokens + for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)): + tokens.append(token_str) + scores.append(0.0) # WordPiece doesn't use scores + + # Determine token type + is_added = any(t.get("content") == token_str for t in added_tokens) + if is_added: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + + # Add to writer + self.writer.add_token_list(tokens) + self.writer.add_token_scores(scores) + self.writer.add_token_types(toktypes) + + logger.info(f"Added WordPiece tokenizer: {len(tokens)} tokens") + + def _add_generic_tokenizer( + self, + model_data: dict[str, Any], + tokenizer_data: dict[str, Any], + ) -> None: + """Add generic tokenizer as fallback.""" + logger.warning("Using generic tokenizer fallback") + + # Try to extract vocabulary from various possible locations + vocab = model_data.get("vocab", tokenizer_data.get("vocab", {})) + + if not vocab: + logger.error("No vocabulary found in tokenizer") + return + + self.writer.add_tokenizer_model("gpt2") # Default to GPT-2 style + + # Create basic token list + tokens = [] + scores = [] + toktypes = [] + + if isinstance(vocab, dict): + # Dict-style vocab + for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)): + tokens.append(token_str) + scores.append(0.0) + toktypes.append(gguf.TokenType.NORMAL) + elif isinstance(vocab, list): + # List-style vocab + for item in vocab: + if isinstance(item, str): + tokens.append(item) + scores.append(0.0) + toktypes.append(gguf.TokenType.NORMAL) + elif isinstance(item, list) and len(item) >= 1: + tokens.append(str(item[0])) + scores.append(float(item[1]) if len(item) > 1 else 0.0) + toktypes.append(gguf.TokenType.NORMAL) + + if tokens: + self.writer.add_token_list(tokens) + self.writer.add_token_scores(scores) + self.writer.add_token_types(toktypes) + logger.info(f"Added generic tokenizer: {len(tokens)} tokens") + else: + logger.error("Failed to extract tokens from vocabulary") + + def add_tensor(self, name: str, data: np.ndarray) -> None: + """Add tensor to GGUF file. + + Accepts a tensor name following GGUF naming conventions and its + corresponding numpy array data. The tensor is stored for writing + when the file is finalised. + """ + self.writer.add_tensor(name, data) + + def write(self) -> None: + """Finalise and write GGUF file to disk. + + Writes header, key-value data, and tensors to the output file, + completing the GGUF creation process. 
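+
+        Example of the expected call order (illustrative sketch; `model_config`,
+        `tokeniser_config` and `embeddings` are assumed to be prepared elsewhere):
+            writer = GGUFWriter(output_path, "llama")
+            writer.add_metadata(model_config, "example-model")
+            writer.add_tokeniser(tokeniser_config)
+            writer.add_tensor("token_embd.weight", embeddings)
+            writer.write()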
+ """ + logger.info(f"Writing GGUF file to {self.output_path}...") + self.writer.write_header_to_file() + self.writer.write_kv_data_to_file() + self.writer.write_tensors_to_file() + self.writer.close() + logger.info("āœ… GGUF file written successfully") diff --git a/helpers/huggingface/__init__.py b/helpers/huggingface/__init__.py new file mode 100644 index 0000000..ec3bef6 --- /dev/null +++ b/helpers/huggingface/__init__.py @@ -0,0 +1,19 @@ +"""HuggingFace operations and integrations. + +Provides client operations, repository management, and file upload +capabilities for HuggingFace repositories. +""" + +from __future__ import annotations + +from helpers.huggingface.client import HuggingFaceClient +from helpers.huggingface.repository import RepositoryManager +from helpers.huggingface.uploader import FileUploader +from helpers.huggingface.wrapper import HuggingFaceUploader + +__all__ = [ + "FileUploader", + "HuggingFaceClient", + "HuggingFaceUploader", + "RepositoryManager", +] diff --git a/helpers/huggingface/client.py b/helpers/huggingface/client.py new file mode 100644 index 0000000..365ec01 --- /dev/null +++ b/helpers/huggingface/client.py @@ -0,0 +1,124 @@ +"""HuggingFace API client operations. + +Provides basic HuggingFace API operations including authentication, +model downloads, and user information retrieval. +""" + +from __future__ import annotations + +import subprocess +from typing import TYPE_CHECKING + +from helpers.logger import logger + +if TYPE_CHECKING: + from pathlib import Path + + +class HuggingFaceClient: + """Manages basic HuggingFace API operations. + + Provides methods for authentication verification, model downloads, + and user information retrieval using the HuggingFace CLI. + """ + + @staticmethod + def get_username() -> str: + """Get authenticated HuggingFace username. + + Retrieves the current user's HuggingFace username using the CLI. + Requires prior authentication via `huggingface-cli login`. + + Returns: + HuggingFace username. + + Raises: + RuntimeError: If not authenticated or CLI not available. + """ + try: + result = subprocess.run( + ["huggingface-cli", "whoami"], + capture_output=True, + text=True, + check=True, + ) + return result.stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError) as err: + msg = "Please log in to HuggingFace first: huggingface-cli login" + raise RuntimeError(msg) from err + + @staticmethod + def download_model( + model_name: str, + output_dir: Path, + include_pattern: str | None = None, + ) -> None: + """Download model from HuggingFace. + + Downloads a complete model or specific files matching a pattern. + Creates the output directory if it doesn't exist. Supports filtered + downloads for efficient bandwidth usage when only certain files are needed. + The model identifier follows HuggingFace naming conventions (e.g. "meta-llama/Llama-2-7b"). + """ + logger.info(f"Downloading {model_name} to {output_dir}") + + cmd = [ + "huggingface-cli", + "download", + model_name, + "--local-dir", + str(output_dir), + ] + + if include_pattern: + cmd.extend(["--include", include_pattern]) + + subprocess.run(cmd, check=True, capture_output=True, text=True) + logger.info("Download complete") + + @staticmethod + def check_authentication() -> bool: + """Check if user is authenticated with HuggingFace. + + Returns: + True if authenticated, False otherwise. 
+ """ + try: + result = subprocess.run( + ["huggingface-cli", "whoami"], + capture_output=True, + text=True, + check=False, + ) + except FileNotFoundError: + logger.error( + "huggingface-cli not found. Please install with: pip install huggingface-hub" + ) + return False + else: + return result.returncode == 0 + + @staticmethod + def get_model_info(model_id: str) -> dict | None: + """Get model information from HuggingFace. + + Retrieves metadata about a model from the HuggingFace Hub using the + CLI interface. Returns the model information as a dictionary if found. + + Returns: + Model information dictionary or None if not found. + """ + try: + # Use huggingface-cli to get model info + result = subprocess.run( + ["huggingface-cli", "model-info", model_id], + capture_output=True, + text=True, + check=True, + ) + except subprocess.CalledProcessError: + logger.warning(f"Could not get info for model: {model_id}") + return None + else: + # Parse the output (this is simplified - actual implementation would parse JSON) + return {"output": result.stdout} diff --git a/helpers/huggingface/repository.py b/helpers/huggingface/repository.py new file mode 100644 index 0000000..d6ea3a9 --- /dev/null +++ b/helpers/huggingface/repository.py @@ -0,0 +1,167 @@ +"""HuggingFace repository management. + +Handles repository creation, configuration, and management operations. +""" + +from __future__ import annotations + +import subprocess +import time + +from helpers.logger import logger + + +class RepositoryManager: + """Manages HuggingFace repository operations. + + Provides methods for creating repositories, checking existence, + and managing repository configuration. + """ + + @staticmethod + def create_repository( + repo_id: str, + private: bool = False, + repo_type: str = "model", + ) -> bool: + """Create a new HuggingFace repository. + + Creates a repository with the specified identifier and settings. Repository + identifiers follow the format "username/repo-name". Supports model, dataset, + and space repository types with configurable visibility. + + Returns: + True if repository was created, False if it already exists. + """ + logger.info(f"Creating repository: {repo_id}") + + cmd = [ + "huggingface-cli", + "repo", + "create", + repo_id, + "--type", + repo_type, + ] + + if private: + cmd.append("--private") + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=False, + ) + + if result.returncode == 0: + logger.info(f"Created repository: {repo_id}") + return True + if "already exists" in result.stderr.lower(): + logger.info(f"Repository already exists: {repo_id}") + return False + logger.error(f"Failed to create repository: {result.stderr}") + except Exception as e: + logger.error(f"Error creating repository: {e}") + + return False + + @staticmethod + def ensure_repository_exists(repo_id: str) -> None: + """Ensure repository exists, creating if necessary. + + Attempts to create the repository if it doesn't exist, then waits + briefly to ensure the repository is ready for operations. + """ + # Try to create the repository + RepositoryManager.create_repository(repo_id) + + # Small delay to ensure repository is ready + time.sleep(2) + + @staticmethod + def check_repository_exists(repo_id: str) -> bool: + """Check if a repository exists. + + Queries the HuggingFace Hub to determine if a repository with the + given identifier exists and is accessible. + + Returns: + True if repository exists, False otherwise. 
+ """ + try: + result = subprocess.run( + ["huggingface-cli", "repo", "ls-files", repo_id], + capture_output=True, + text=True, + check=False, + ) + except Exception: + return False + else: + return result.returncode == 0 + + @staticmethod + def delete_repository(repo_id: str) -> bool: + """Delete a HuggingFace repository. + + Permanently removes a repository from the HuggingFace Hub. This operation + cannot be undone and requires appropriate permissions. + + Returns: + True if deleted successfully, False otherwise. + """ + logger.warning(f"Deleting repository: {repo_id}") + + try: + result = subprocess.run( + ["huggingface-cli", "repo", "delete", repo_id, "--yes"], + capture_output=True, + text=True, + check=False, + ) + + if result.returncode == 0: + logger.info(f"Deleted repository: {repo_id}") + return True + logger.error(f"Failed to delete repository: {result.stderr}") + except Exception as e: + logger.error(f"Error deleting repository: {e}") + return False + else: + return False + + @staticmethod + def get_repository_url(repo_id: str) -> str: + """Get the full URL for a repository. + + Constructs the complete HuggingFace Hub URL for accessing the repository + through a web browser. + + Returns: + Full HuggingFace URL for the repository. + """ + return f"https://huggingface.co/{repo_id}" + + @staticmethod + def set_repository_visibility(repo_id: str, private: bool) -> bool: + """Set repository visibility (public/private). + + Changes the visibility setting of an existing repository. Private repositories + require appropriate permissions and may have usage limitations. + + Returns: + True if visibility changed successfully. + """ + visibility = "private" if private else "public" + logger.info(f"Setting {repo_id} visibility to {visibility}") + + try: + # Note: This would require using the HuggingFace API directly + # as the CLI doesn't support changing visibility + logger.warning("Changing repository visibility requires API access") + except Exception as e: + logger.error(f"Error changing visibility: {e}") + + return False diff --git a/helpers/huggingface/uploader.py b/helpers/huggingface/uploader.py new file mode 100644 index 0000000..7984206 --- /dev/null +++ b/helpers/huggingface/uploader.py @@ -0,0 +1,330 @@ +"""HuggingFace file upload operations. + +Handles uploading files to HuggingFace repositories with retry logic +and error handling. +""" + +from __future__ import annotations + +import shutil +import subprocess +import tempfile +import time +from pathlib import Path + +from helpers.huggingface.repository import RepositoryManager +from helpers.logger import logger + + +class FileUploader: + """Manages file uploads to HuggingFace repositories. + + Provides methods for uploading models, READMEs, and other files + with proper error handling, retry logic, and git-based fallbacks. + """ + + @staticmethod + def upload_file( + repo_id: str, + local_path: Path, + repo_path: str | None = None, + create_repo: bool = False, + ) -> None: + """Upload a file to HuggingFace repository. + + Uploads a single file to the specified repository path. Can create + the repository if it doesn't exist. Uses git directly when possible + to avoid automatic PR creation. Repository identifiers follow the format + "username/repo-name". Files are uploaded to the main branch by default. + + Raises: + CalledProcessError: If upload fails. 
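+
+        Example (illustrative repository name and file path):
+            FileUploader.upload_file(
+                "username/example-model-GGUF",
+                Path("./output/example-model-Q4_K_M.gguf"),
+                create_repo=True,
+            )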
+ """ + repo_path = repo_path or local_path.name + logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}") + + # Try git-based upload first to avoid PR creation + if FileUploader._try_git_upload(repo_id, local_path, repo_path, create_repo=create_repo): + logger.info(f"Uploaded {repo_path} via git") + return + + # Fallback to huggingface-cli + logger.info("Git upload failed, trying huggingface-cli...") + cmd = [ + "huggingface-cli", + "upload", + repo_id, + str(local_path), + repo_path, + "--revision", + "main", # Explicitly push to main branch + "--commit-message", + f"Add {repo_path}", + ] + + if create_repo: + cmd.append("--create") + + try: + subprocess.run(cmd, check=True, capture_output=True) + logger.info(f"Uploaded {repo_path}") + except subprocess.CalledProcessError: + if create_repo: + # Repository might already exist, retry without --create + cmd = cmd[:-1] # Remove --create flag + subprocess.run(cmd, check=True, capture_output=True, text=True) + logger.info(f"Updated {repo_path}") + else: + raise + + @staticmethod + def _try_git_upload( + repo_id: str, + local_path: Path, + repo_path: str, + *, + create_repo: bool = False, + ) -> bool: + """Try to upload file using git directly to avoid PR creation. + + Returns: + bool: True if upload successful, False if should fallback to CLI. + """ + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + repo_url = f"https://huggingface.co/{repo_id}" + + # Clone repository + logger.info(f"Cloning {repo_url}...") + result = subprocess.run( + ["git", "clone", repo_url, str(temp_path / "repo")], + check=False, + capture_output=True, + text=True, + ) + + if result.returncode != 0: + if create_repo: + # Repository doesn't exist, let huggingface-cli handle creation + return False + logger.warning(f"Clone failed: {result.stderr}") + return False + + repo_dir = temp_path / "repo" + target_file = repo_dir / repo_path + + # Ensure target directory exists + target_file.parent.mkdir(parents=True, exist_ok=True) + + # Copy file + shutil.copy2(local_path, target_file) + + # Check if there are any changes + status_result = subprocess.run( + ["git", "status", "--porcelain"], + cwd=repo_dir, + capture_output=True, + text=True, + check=True, + ) + + if not status_result.stdout.strip(): + logger.info(f"No changes detected for {repo_path}, file already up-to-date") + return True # File is already up-to-date, no need to push + + # Git add, commit, push + subprocess.run( + ["git", "add", repo_path], + cwd=repo_dir, + check=True, + capture_output=True, + text=True, + ) + subprocess.run( + ["git", "commit", "-m", f"Update {repo_path}"], + cwd=repo_dir, + check=True, + capture_output=True, + text=True, + ) + subprocess.run( + ["git", "push"], + cwd=repo_dir, + check=True, + capture_output=True, + text=True, + ) + + return True + + except subprocess.CalledProcessError as e: + logger.warning(f"Git upload failed: {e}") + return False + except Exception as e: + logger.warning(f"Git upload error: {e}") + return False + + @staticmethod + def upload_readme( + repo_id: str, + readme_path: Path, + ensure_repo: bool = True, + ) -> None: + """Upload or update README file to repository. + + Creates repository if needed, handles existing repository updates. + The README is uploaded as README.md in the repository root and will + replace any existing README file. + + Raises: + RuntimeError: If the README upload fails. 
+ """ + logger.info("Uploading README...") + + # Add delay to prevent rate limiting + time.sleep(2) + + # First ensure the repository exists if requested + if ensure_repo: + RepositoryManager.ensure_repository_exists(repo_id) + + # Upload without --create flag to avoid PR creation + try: + logger.debug(f"DEBUG: Uploading README to {repo_id}") + subprocess.run( + [ + "huggingface-cli", + "upload", + repo_id, + str(readme_path), + "README.md", + "--commit-message", + "Update README.md", + ], + check=True, + capture_output=True, + text=True, + ) + logger.info("README uploaded successfully") + except subprocess.CalledProcessError as e: + # Retry with delay in case of rate limiting + if "429" in str(e.stderr): + logger.warning("Rate limited, waiting 30 seconds...") + time.sleep(30) + subprocess.run( + [ + "huggingface-cli", + "upload", + repo_id, + str(readme_path), + "README.md", + "--commit-message", + "Update README.md", + ], + check=True, + capture_output=True, + text=True, + ) + logger.info("README uploaded successfully (after retry)") + else: + msg = f"Failed to upload README: {e.stderr}" + raise RuntimeError(msg) from e + + @staticmethod + def upload_model_file( + repo_id: str, + model_path: Path, + repo_filename: str | None = None, + ) -> None: + """Upload a model file to repository. + + Optimised for large model file uploads with progress tracking. + The model file is uploaded to the repository root by default or + to the specified filename if provided. + + Raises: + subprocess.CalledProcessError: If the upload fails. + """ + repo_filename = repo_filename or model_path.name + logger.info( + f"Uploading model file {model_path.name} " + f"({model_path.stat().st_size / (1024**3):.1f}GB)..." + ) + + cmd = [ + "huggingface-cli", + "upload", + repo_id, + str(model_path), + repo_filename, + "--commit-message", + f"Add {repo_filename}", + ] + + try: + # Run with output streaming for large files + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + universal_newlines=True, + ) + + # Stream output + if process.stdout: + for line in iter(process.stdout.readline, ""): + if line and "upload" in line.lower(): + logger.debug(line.strip()) + + process.wait() + + if process.returncode != 0: + raise subprocess.CalledProcessError(process.returncode, cmd) + + logger.info(f"Successfully uploaded {repo_filename}") + + except subprocess.CalledProcessError as e: + logger.error(f"Failed to upload model file: {e}") + raise + + @staticmethod + def upload_folder( + repo_id: str, + folder_path: Path, + path_in_repo: str = ".", + ignore_patterns: list[str] | None = None, + ) -> None: + """Upload an entire folder to repository. + + Recursively uploads all files from a local folder to the repository, + preserving the directory structure. Supports ignore patterns for + selective uploads. + + Raises: + subprocess.CalledProcessError: If the upload fails. 
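+
+        Example (illustrative; ignore patterns are passed through to `--exclude`):
+            FileUploader.upload_folder(
+                "username/example-model-GGUF",
+                Path("./output/extras"),
+                ignore_patterns=["*.tmp", "*.log"],
+            )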
+ """ + logger.info(f"Uploading folder {folder_path} to {repo_id}/{path_in_repo}") + + cmd = [ + "huggingface-cli", + "upload", + repo_id, + str(folder_path), + path_in_repo, + "--commit-message", + f"Upload {folder_path.name}", + ] + + if ignore_patterns: + for pattern in ignore_patterns: + cmd.extend(["--exclude", pattern]) + + try: + subprocess.run(cmd, check=True, capture_output=True, text=True) + logger.info(f"Successfully uploaded folder {folder_path.name}") + except subprocess.CalledProcessError as e: + logger.error(f"Failed to upload folder: {e}") + raise diff --git a/helpers/huggingface/wrapper.py b/helpers/huggingface/wrapper.py new file mode 100644 index 0000000..3c9d47e --- /dev/null +++ b/helpers/huggingface/wrapper.py @@ -0,0 +1,57 @@ +"""Compatibility wrapper for HuggingFace operations. + +Provides a compatible interface matching the old HuggingFaceUploader +class for backward compatibility during refactoring. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from helpers.huggingface.client import HuggingFaceClient +from helpers.huggingface.repository import RepositoryManager +from helpers.huggingface.uploader import FileUploader + +if TYPE_CHECKING: + from pathlib import Path + + +class HuggingFaceUploader: + """Compatibility wrapper for HuggingFace operations. + + Maintains the same interface as the old HuggingFaceUploader class + while using the new modular components internally. + """ + + @staticmethod + def get_username() -> str: + """Get authenticated HuggingFace username. + + Returns: + HuggingFace username from CLI authentication. + """ + return HuggingFaceClient.get_username() + + def upload_readme(self, output_repo: str, readme_path: Path) -> None: + """Upload or update README file to repository. + + Creates repository if needed, handles existing repository updates. + The README is uploaded to the repository root as README.md. + """ + FileUploader.upload_readme(output_repo, readme_path, ensure_repo=True) + + def upload_model_file(self, output_repo: str, model_path: Path) -> None: + """Upload model file to repository. + + Uploads GGUF model file to specified repository path. The file + is uploaded with progress tracking suitable for large model files. + """ + FileUploader.upload_model_file(output_repo, model_path) + + def _ensure_repo_exists(self, repo_id: str) -> None: + """Ensure the repository exists, creating it if necessary. + + Creates the repository if it doesn't exist and waits briefly + to ensure it's ready for subsequent operations. + """ + RepositoryManager.ensure_repository_exists(repo_id) diff --git a/helpers/llama_cpp/__init__.py b/helpers/llama_cpp/__init__.py new file mode 100644 index 0000000..1b33c8b --- /dev/null +++ b/helpers/llama_cpp/__init__.py @@ -0,0 +1,20 @@ +"""llama.cpp operations and binary management. + +Provides interfaces to llama.cpp binaries for quantisation and +importance matrix generation. 
+""" + +from __future__ import annotations + +from helpers.llama_cpp.architecture import ArchitectureDetector +from helpers.llama_cpp.binary_manager import BinaryManager +from helpers.llama_cpp.imatrix import IMatrixGenerator, IMatrixHandler +from helpers.llama_cpp.quantiser import QuantisationExecutor + +__all__ = [ + "ArchitectureDetector", + "BinaryManager", + "IMatrixGenerator", + "IMatrixHandler", + "QuantisationExecutor", +] diff --git a/helpers/llama_cpp/architecture.py b/helpers/llama_cpp/architecture.py new file mode 100644 index 0000000..2cd162f --- /dev/null +++ b/helpers/llama_cpp/architecture.py @@ -0,0 +1,235 @@ +"""Architecture detection and support checking. + +Determines whether model architectures are supported by llama.cpp +and provides fallback strategies for unsupported architectures. +""" + +from __future__ import annotations + +import subprocess +from typing import TYPE_CHECKING + +from helpers.logger import logger + +if TYPE_CHECKING: + from pathlib import Path + + +class ArchitectureDetector: + """Detects and validates model architecture support. + + Checks whether model architectures are supported by llama.cpp + for K-quant generation and determines appropriate quantisation + strategies for unsupported architectures. + """ + + @staticmethod + def check_architecture_support(f16_model_path: Path) -> bool: + """Check if the model architecture is supported by llama.cpp. + + Tests the model's compatibility by attempting a quantisation with + llama.cpp. Returns true if the architecture is unsupported, indicating + that K-quants should be skipped. + + Returns: + True if architecture is NOT supported (K-quants should be skipped) + """ + try: + # Try a simple quantization with llama.cpp to check support + result = subprocess.run( + [ + ".cache/llm-gguf-tools/binaries/llama-quantize", + str(f16_model_path), + "/dev/null", + "Q4_K_M", + ], + check=False, + capture_output=True, + text=True, + timeout=5, + ) + + # Check if it failed due to unknown architecture + return bool(result.stderr and "unknown model architecture" in result.stderr.lower()) + except Exception: + # If we can't determine, assume it might work + return False + + @staticmethod + def get_supported_architectures() -> list[str]: + """Get list of architectures known to be supported by llama.cpp. + + Returns: + List of supported architecture names. + """ + return [ + "llama", + "llama2", + "llama3", + "mistral", + "mixtral", + "qwen", + "qwen2", + "gemma", + "gemma2", + "phi", + "phi2", + "phi3", + "falcon", + "gpt2", + "gptj", + "gptneox", + "mpt", + "starcoder", + "starcoder2", + "baichuan", + "bert", + "bloom", + "deepseek", + "deepseek2", + "chatglm", + "orion", + "internlm2", + "minicpm", + "stablelm", + "cohere", + "dbrx", + "olmo", + "arctic", + "rwkv", + ] + + @staticmethod + def map_architecture(model_type: str, arch_name: str) -> str: + """Map model architecture to GGUF architecture string. + + Translates model type and architecture names from HuggingFace config + to GGUF-compatible architecture identifiers. Handles special cases like + "gpt-oss" to "gptoss" conversion and provides fallback mapping. + + Returns: + GGUF architecture string to use. 
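+
+        Examples (following the mappings defined below):
+            map_architecture("gpt-oss", "GptOssForCausalLM")   # -> "gptoss"
+            map_architecture("mistral", "MistralForCausalLM")  # -> "llama"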
+ """ + # Direct mappings from model_type + type_mappings = { + "llama": "llama", + "mistral": "llama", # Mistral uses llama architecture + "mixtral": "llama", + "qwen": "qwen", + "qwen2": "qwen2", + "gemma": "gemma", + "gemma2": "gemma2", + "phi": "phi2", + "phi3": "phi3", + "phi-msft": "phi2", + "falcon": "falcon", + "gpt2": "gpt2", + "gptj": "gptj", + "gpt_neox": "gptneox", + "gpt-oss": "gptoss", + "mpt": "mpt", + "starcoder": "starcoder", + "starcoder2": "starcoder2", + "baichuan": "baichuan", + "bloom": "bloom", + "chatglm": "chatglm", + "deepseek": "llama", # DeepSeek uses llama architecture + "stablelm": "stablelm", + "cohere": "cohere", + "dbrx": "dbrx", + "olmo": "olmo", + "arctic": "arctic", + } + + # Check model_type first + if model_type in type_mappings: + return type_mappings[model_type] + + # Architecture name mappings as fallback + arch_mappings = { + "LlamaForCausalLM": "llama", + "MistralForCausalLM": "llama", + "MixtralForCausalLM": "llama", + "Qwen2ForCausalLM": "qwen2", + "QwenForCausalLM": "qwen", + "GemmaForCausalLM": "gemma", + "Gemma2ForCausalLM": "gemma2", + "GptOssForCausalLM": "gptoss", + "PhiForCausalLM": "phi2", + "Phi3ForCausalLM": "phi3", + "FalconForCausalLM": "falcon", + "GPT2LMHeadModel": "gpt2", + "GPTJForCausalLM": "gptj", + "GPTNeoXForCausalLM": "gptneox", + "MPTForCausalLM": "mpt", + "BloomForCausalLM": "bloom", + "ChatGLMForCausalLM": "chatglm", + "StableLmForCausalLM": "stablelm", + "CohereForCausalLM": "cohere", + } + + if arch_name in arch_mappings: + return arch_mappings[arch_name] + + # Default fallback + logger.warning(f"Unknown architecture: {arch_name} (type: {model_type})") + logger.warning("Defaulting to 'llama' architecture - may not work correctly") + return "llama" + + @staticmethod + def get_quantisation_support(architecture: str) -> dict[str, bool]: + """Determine which quantisation types are supported for an architecture. + + Evaluates architecture compatibility with different quantisation methods. + Basic quantisations are always supported via GGML, while K-quants and + imatrix require specific llama.cpp support. + + Returns: + Dictionary mapping quantisation type categories to support status. + """ + # Known unsupported architectures for K-quants + unsupported_kquants = [ + "bert", + "dotsocr", # Custom/unknown architectures + ] + + is_supported = architecture not in unsupported_kquants + + return { + "basic": True, # Q4_0, Q5_0, Q6_0, Q8_0 always supported via GGML + "k_quants": is_supported, # K-quants require llama.cpp support + "imatrix": is_supported, # imatrix requires llama.cpp support + } + + @staticmethod + def filter_quantisation_types( + architecture: str, + requested_types: list[str], + ) -> tuple[list[str], list[str]]: + """Filter quantisation types based on architecture support. + + Separates requested quantisation types into supported and unsupported + based on the model's architecture capabilities. Basic types are always + supported, while K-quants depend on architecture compatibility. + + Returns: + Tuple of (supported_types, skipped_types). 
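+
+        Example (illustrative; "bert" has no K-quant support in this mapping):
+            supported, skipped = ArchitectureDetector.filter_quantisation_types(
+                "bert", ["Q4_0", "Q4_K_M", "Q8_0"]
+            )
+            # supported == ["Q4_0", "Q8_0"], skipped == ["Q4_K_M"]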
+ """ + support = ArchitectureDetector.get_quantisation_support(architecture) + basic_types = {"Q4_0", "Q5_0", "Q6_0", "Q8_0"} + + supported = [] + skipped = [] + + for quant_type in requested_types: + if quant_type in basic_types: + # Basic types always supported + supported.append(quant_type) + elif support["k_quants"]: + # K-quants supported for this architecture + supported.append(quant_type) + else: + # K-quants not supported + skipped.append(quant_type) + + return supported, skipped diff --git a/helpers/services/binary_manager.py b/helpers/llama_cpp/binary_manager.py similarity index 92% rename from helpers/services/binary_manager.py rename to helpers/llama_cpp/binary_manager.py index f41f58a..cc037ff 100644 --- a/helpers/services/binary_manager.py +++ b/helpers/llama_cpp/binary_manager.py @@ -54,8 +54,9 @@ class BinaryManager: def _get_binary_path(self, base_name: str) -> Path: """Get path to binary. - Args: - base_name: Base name of binary (without extension). + Constructs the full path to a binary executable based on the base + name, automatically adding the appropriate file extension for the + current operating system platform. Returns: Path where binary should be located. @@ -82,9 +83,9 @@ class BinaryManager: def _get_binary(self, name: str, binary_path: Path) -> Path | None: """Get a specific binary, downloading if necessary. - Args: - name: Name of the binary. - binary_path: Path where binary should be located. + Checks for existing binaries and downloads the latest release if + updates are needed. Falls back to existing binaries if download + fails, ensuring robust binary availability for quantisation tasks. Returns: Path to binary if available, None if download fails. @@ -275,8 +276,9 @@ class BinaryManager: def _download_and_extract(self, url: str) -> bool: """Download and extract binary archive. - Args: - url: Download URL for archive. + Downloads the binary archive from the specified URL and extracts + the necessary binaries and shared libraries. Handles both ZIP and + TAR.GZ formats with appropriate platform-specific permissions. Returns: True if successful, False otherwise. @@ -401,10 +403,9 @@ class BinaryManager: ) -> None: """Extract shared libraries needed by the binaries. - Args: - archive: The archive object. - members: List of all archive members. - lib_patterns: Patterns to match for library files. + Searches through archive members to find shared libraries matching + the specified patterns and extracts them to ensure proper binary + functionality. Sets appropriate permissions on Unix systems. """ for member in members: base_name = Path(member).name @@ -437,8 +438,9 @@ class BinaryManager: def _save_version_info(self, release_info: dict[str, Any]) -> None: """Save version information to cache. - Args: - release_info: GitHub release information. + Stores release version, timestamp, and URL information to the local + cache to enable version checking and update determination for + future binary manager operations. """ version_data = { "version": release_info.get("tag_name", "unknown"), @@ -454,8 +456,9 @@ class BinaryManager: def check_binary_works(self, binary_path: Path | None = None) -> bool: """Check if the binary actually works. - Args: - binary_path: Path to binary to check. If None, checks quantize binary. + Validates that the specified binary can execute properly by running + a help command with appropriate environment variables set for shared + library loading. Defaults to checking the quantise binary if no path provided. 
Returns: True if binary executes successfully, False otherwise. diff --git a/helpers/services/imatrix_generator.py b/helpers/llama_cpp/imatrix.py similarity index 76% rename from helpers/services/imatrix_generator.py rename to helpers/llama_cpp/imatrix.py index c6139bc..ae5ad66 100644 --- a/helpers/services/imatrix_generator.py +++ b/helpers/llama_cpp/imatrix.py @@ -1,7 +1,7 @@ -"""Importance matrix generation service. +"""Importance matrix operations for llama.cpp. -Generates importance matrices using llama-imatrix binary with calibration -data for improved quantisation quality. +Handles importance matrix generation and management for improved +quantisation quality. """ from __future__ import annotations @@ -12,13 +12,78 @@ import subprocess from pathlib import Path from typing import TYPE_CHECKING +from helpers.filesystem import FilesystemService +from helpers.llama_cpp.binary_manager import BinaryManager from helpers.logger import logger -from helpers.services.binary_manager import BinaryManager if TYPE_CHECKING: from helpers.models.quantisation import ModelSource +class IMatrixHandler: + """Handles importance matrix file management. + + Manages detection and use of existing importance matrix files for + quantisation guidance. + """ + + def __init__(self) -> None: + """Initialise IMatrixHandler.""" + self.fs = FilesystemService() + + def find_imatrix(self, model_dir: Path) -> Path | None: + """Find existing imatrix file in model directory. + + Returns: + Path to imatrix file if found, None otherwise. + """ + imatrix_path = model_dir / "imatrix.dat" + + if imatrix_path.exists(): + file_size = self.fs.get_file_size(imatrix_path) + logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})") + return imatrix_path + + return None + + def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None: + """Prompt user for existing imatrix file. + + Returns: + Path to user-provided imatrix, or None if not available. + """ + imatrix_path = model_dir / "imatrix.dat" + + logger.info(f"Model directory: {model_dir}") + logger.info(f"Looking for imatrix file at: {imatrix_path}") + logger.info( + "Tip: You can download pre-computed imatrix files from Bartowski's repositories!" + ) + logger.info( + " Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix" + ) + + response = ( + input("\nā“ Do you have an imatrix file to place in the model directory? (y/N): ") + .strip() + .lower() + ) + + if response != "y": + return None + + logger.info(f"Please place your imatrix.dat file in: {model_dir}") + input("ā³ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...") + + if imatrix_path.exists(): + file_size = self.fs.get_file_size(imatrix_path) + logger.info(f"Found imatrix file! ({file_size})") + return imatrix_path + + logger.warning("No imatrix.dat file found - continuing without imatrix") + return None + + class IMatrixGenerator: """Generates importance matrices for quantisation guidance. @@ -218,10 +283,9 @@ class IMatrixGenerator: ) -> Path | None: """Prompt user to generate imatrix. - Args: - model_source: Model source information. - model_dir: Model directory. - f16_model_path: Path to F16 model. + Interactively prompts the user to generate an importance matrix + for enhanced quantisation quality using the model source information, + directory, and F16 model path. Checks binary availability before prompting. Returns: Path to generated imatrix or None if skipped. 
diff --git a/helpers/llama_cpp/quantiser.py b/helpers/llama_cpp/quantiser.py new file mode 100644 index 0000000..48c2131 --- /dev/null +++ b/helpers/llama_cpp/quantiser.py @@ -0,0 +1,219 @@ +"""Direct llama.cpp quantisation execution. + +Provides direct execution of llama.cpp quantisation binary with proper +tensor-specific override support for L and XL variants. +""" + +from __future__ import annotations + +import os +import platform +import subprocess +from pathlib import Path +from typing import TYPE_CHECKING + +from helpers.filesystem import FilesystemService +from helpers.llama_cpp.binary_manager import BinaryManager +from helpers.logger import logger + +if TYPE_CHECKING: + from helpers.models.quantisation import QuantisationConfig + + +class QuantisationExecutor: + """Executes llama.cpp quantisation with tensor overrides. + + Provides direct binary execution with proper command-line flags for + tensor-specific overrides, supporting Bartowski-style L and XL variants. + """ + + def __init__(self) -> None: + """Initialise quantisation executor.""" + self.fs = FilesystemService() + self.binary_manager = BinaryManager() + self.quantise_binary = self._get_quantise_binary() + self.last_error: str | None = None # Track last error type + + def _get_quantise_binary(self) -> Path | None: + """Get llama-quantize binary, downloading if necessary. + + Returns: + Path to binary if found, None otherwise. + """ + # First check local directory for manual placement + local_binary = Path("./llama-quantize") + if local_binary.exists(): + logger.info(f"Using local llama-quantize binary: {local_binary}") + return local_binary + + # Download from GitHub releases + binary_path = self.binary_manager.get_quantise_binary() + if binary_path and self.binary_manager.check_binary_works(binary_path): + logger.info(f"Using llama-quantize binary: {binary_path}") + return binary_path + + logger.error("Failed to obtain llama-quantize binary") + logger.info( + "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases" + ) + return None + + def execute_quantisation( + self, + input_path: Path, + output_path: Path, + config: QuantisationConfig, + imatrix_path: Path | None = None, + ) -> bool: + """Execute quantisation using llama.cpp binary. + + Builds and executes llama-quantize command with proper tensor override + flags for L and XL variants. + + Returns: + True if quantisation successful, False otherwise. + """ + if not self.quantise_binary: + logger.error("llama-quantize binary not available") + return False + + # Build command + cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path) + + # Execute with real-time output + return self._execute_command(cmd) + + def _build_quantisation_command( + self, + input_path: Path, + output_path: Path, + config: QuantisationConfig, + imatrix_path: Path | None, + ) -> list[str]: + """Build llama-quantize command with tensor overrides. + + Returns: + Command arguments as list. 
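+
+        Example result (illustrative, for a hypothetical XL-style config with
+        q8_0 embeddings, q6_k output and an imatrix file available):
+            [".../llama-quantize", "--imatrix", "imatrix.dat",
+             "--output-tensor-type", "q6_k", "--token-embedding-type", "q8_0",
+             "model-f16.gguf", "model-Q3_K_XL.gguf", "Q3_K_M"]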
+ """ + cmd = [str(self.quantise_binary)] + + # Add imatrix if available + if imatrix_path: + cmd.extend(["--imatrix", str(imatrix_path)]) + + # Add tensor overrides for L and XL variants + if config.output_type: + cmd.extend(["--output-tensor-type", config.output_type]) + if config.embedding_type: + cmd.extend(["--token-embedding-type", config.embedding_type]) + + # Add input, output, and quantisation type + cmd.extend([str(input_path), str(output_path), config.base_type]) + + return cmd + + def _setup_environment(self) -> dict[str, str]: + """Set up environment variables for quantisation command. + + Returns: + Environment dictionary with necessary library paths. + """ + env = os.environ.copy() + if platform.system() != "Windows": + lib_path = str(self.binary_manager.BINARY_DIR) + if "LD_LIBRARY_PATH" in env: + env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}" + else: + env["LD_LIBRARY_PATH"] = lib_path + return env + + def _process_output_stream(self, process: subprocess.Popen) -> tuple[list[str], bool]: + """Process subprocess output stream and detect errors. + + Returns: + Tuple of (output_lines, architecture_error_detected). + """ + output_lines = [] + architecture_error = False + + if process.stdout: + for line in iter(process.stdout.readline, ""): + if line: + cleaned_line = line.rstrip() + output_lines.append(cleaned_line) + logger.info(f" {cleaned_line}") + + # Check for architecture errors + if any( + error_text in cleaned_line.lower() + for error_text in [ + "unknown model architecture", + "unsupported architecture", + "unknown architecture", + "architecture not supported", + "model architecture", + "llama_model_load: error loading model", + ] + ): + architecture_error = True + + return output_lines, architecture_error + + def _handle_architecture_error(self, output_lines: list[str]) -> bool: + """Handle architecture-related errors by checking output. + + Returns: + True if architecture error was detected and handled. + """ + # Look for architecture info in recent output + for line in output_lines[-10:]: # Check last 10 lines + if "architecture" in line.lower(): + logger.error("āŒ Architecture not supported by llama.cpp") + logger.error(" so cannot be quantised with current llama.cpp but") + logger.error(" F16 GGUF file can be used for inference if supported") + # Store this for the orchestrator to detect + self.last_error = "unsupported_architecture" + return True + return False + + def _execute_command(self, cmd: list[str]) -> bool: + """Execute command with real-time output streaming. + + Returns: + True if successful, False otherwise. 
+ """ + try: + logger.info(f"šŸ”§ Executing: {' '.join(cmd)}") + + env = self._setup_environment() + + # Execute with real-time output + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + universal_newlines=True, + env=env, + ) + + output_lines, architecture_error = self._process_output_stream(process) + + return_code = process.poll() + if return_code == 0: + logger.info("āœ… Quantisation successful!") + return True + + # Check if this was an architecture error + if (architecture_error or return_code == 1) and self._handle_architecture_error( + output_lines + ): + return False + + logger.error(f"āŒ Quantisation failed with return code {return_code}") + + except Exception as e: + logger.error(f"āŒ Quantisation failed with exception: {e}") + + return False diff --git a/helpers/quantisation/__init__.py b/helpers/quantisation/__init__.py new file mode 100644 index 0000000..8ea0411 --- /dev/null +++ b/helpers/quantisation/__init__.py @@ -0,0 +1,23 @@ +"""Quantisation orchestration and workflow management. + +Provides high-level orchestration of the quantisation workflow, +including execution, progress tracking, and profile management. +""" + +from __future__ import annotations + +from helpers.quantisation.engine import QuantisationEngine +from helpers.quantisation.executor import QuantisationExecutor +from helpers.quantisation.model_manager import ModelManager +from helpers.quantisation.orchestrator import QuantisationOrchestrator +from helpers.quantisation.profile_manager import ProfileManager +from helpers.quantisation.progress import ProgressReporter + +__all__ = [ + "ModelManager", + "ProfileManager", + "ProgressReporter", + "QuantisationEngine", + "QuantisationExecutor", + "QuantisationOrchestrator", +] diff --git a/helpers/quantisation/engine.py b/helpers/quantisation/engine.py new file mode 100644 index 0000000..583825c --- /dev/null +++ b/helpers/quantisation/engine.py @@ -0,0 +1,141 @@ +"""Quantisation engine for model processing. + +Handles the actual quantisation process with configurable methods, +supporting multiple quantisation backends and fallback strategies. +""" + +from __future__ import annotations + +import traceback +from typing import TYPE_CHECKING + +from helpers.filesystem import FilesystemService +from helpers.ggml import GGMLQuantiser +from helpers.llama_cpp import QuantisationExecutor +from helpers.logger import logger +from helpers.models.quantisation import QuantisationResult, QuantisationType + +if TYPE_CHECKING: + from pathlib import Path + + from helpers.models.quantisation import ( + QuantisationContext, + ) + + +class QuantisationEngine: + """Handles the actual quantisation process with configurable methods. + + Provides flexible quantisation execution supporting multiple tensor + precision configurations, importance matrices, and fallback strategies. + Uses direct llama.cpp binary execution with proper tensor overrides. + """ + + def __init__(self) -> None: + """Initialise quantisation engine.""" + self.fs = FilesystemService() + self.executor = QuantisationExecutor() + self.ggml_quantiser = GGMLQuantiser() + + def quantise(self, context: QuantisationContext) -> QuantisationResult: + """Perform quantisation using the specified configuration. + + Executes quantisation using direct llama.cpp binary with proper + tensor override flags for L and XL variants. Falls back to GGML + for basic types when architecture is unsupported. 
Processes the + quantisation context containing all required parameters and settings. + + Returns: + QuantisationResult with success status and file information. + """ + logger.info( + f"āš™ļø Creating {context.config.name} quantisation ({context.config.description})..." + ) + + output_path = context.get_output_path() + + # Check input file exists and is readable + if not context.f16_model_path.exists(): + error_msg = f"Input model file does not exist: {context.f16_model_path}" + logger.error(f"āŒ {error_msg}") + return QuantisationResult( + quantisation_type=QuantisationType(context.config.name), + success=False, + error_message=error_msg, + ) + + logger.info(f"šŸŽÆ Attempting {context.config.name} quantisation...") + logger.info(f"šŸ“ Source: {context.f16_model_path}") + logger.info(f"šŸ“ Target: {output_path}") + + # Determine if this is a basic type that can use GGML + basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] + is_basic_type = context.config.name in basic_types + + try: + # Try llama.cpp first for all types + logger.info("šŸ”§ Using llama.cpp binary for quantisation...") + + success = self.executor.execute_quantisation( + context.f16_model_path, output_path, context.config, context.imatrix_path + ) + + if success: + return self._create_success_result(context.config.name, output_path, "llama.cpp") + + # Check if this was an architecture error and we can use GGML fallback + if ( + hasattr(self.executor, "last_error") + and self.executor.last_error == "unsupported_architecture" + and is_basic_type + ): + logger.info("šŸ”„ Architecture unsupported - using GGML implementation...") + + success = self.ggml_quantiser.try_alternative_quantisation( + context.f16_model_path, output_path, context.config.name + ) + + if success: + return self._create_success_result( + context.config.name, output_path, "GGML numpy" + ) + + logger.error(f"āŒ {context.config.name} quantisation failed") + return QuantisationResult( + quantisation_type=QuantisationType(context.config.name), + success=False, + error_message="Quantisation failed via Python API", + ) + + except Exception as e: + logger.error(f"āŒ Exception during {context.config.name} quantisation: {e}") + logger.error("Exception traceback:") + for line in traceback.format_exc().splitlines(): + logger.error(f" {line}") + + return QuantisationResult( + quantisation_type=QuantisationType(context.config.name), + success=False, + error_message=f"Exception during quantisation: {e!s}", + ) + + def _create_success_result( + self, quant_type: str, output_path: Path, method_used: str + ) -> QuantisationResult: + """Create successful quantisation result with file metadata. + + Constructs a successful quantisation result containing file size + information and method details. Uses the quantisation type, output + path, and method information to generate comprehensive result metadata. + + Returns: + QuantisationResult with file path and size information. + """ + file_size = self.fs.get_file_size(output_path) + return QuantisationResult( + quantisation_type=QuantisationType(quant_type), + success=True, + file_path=output_path, + file_size=file_size, + method_used=method_used, + ) diff --git a/helpers/quantisation/executor.py b/helpers/quantisation/executor.py new file mode 100644 index 0000000..f924347 --- /dev/null +++ b/helpers/quantisation/executor.py @@ -0,0 +1,457 @@ +"""Quantisation execution management. + +Handles the execution of quantisation operations including parallel +uploads, status tracking, and error handling. 
+""" + +from __future__ import annotations + +import gc +import traceback +from concurrent.futures import ThreadPoolExecutor +from typing import TYPE_CHECKING, Any + +from helpers.config.quantisation_configs import QUANTISATION_CONFIGS +from helpers.logger import logger +from helpers.models.quantisation import ( + QuantisationContext, + QuantisationResult, + QuantisationType, +) +from helpers.quantisation.progress import ProgressReporter +from helpers.utils.rate_limiter import ReadmeRateLimiter + +if TYPE_CHECKING: + from pathlib import Path + + from helpers.filesystem import FileCleanup + from helpers.huggingface import HuggingFaceUploader + from helpers.models.quantisation import ModelSource + from helpers.quantisation.engine import QuantisationEngine + from helpers.readme import ReadmeGenerator + + +class QuantisationExecutor: + """Executes quantisation operations with parallel upload support. + + Manages the execution of multiple quantisations with background + uploads, status tracking, and proper error handling. + """ + + def __init__( + self, + quantisation_engine: QuantisationEngine, + uploader: HuggingFaceUploader, + readme_generator: ReadmeGenerator, + file_cleanup: FileCleanup, + no_upload: bool = False, + ) -> None: + """Initialise quantisation executor. + + Sets up the quantisation executor with all required service dependencies + for performing quantisations, uploading results, generating documentation, + and cleaning up temporary files. Configures upload behaviour based on settings. + """ + self.quantisation_engine = quantisation_engine + self.uploader = uploader + self.readme_generator = readme_generator + self.file_cleanup = file_cleanup + self.no_upload = no_upload + self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0) + self.progress_reporter = ProgressReporter() + + def execute_quantisations( + self, + model_source: ModelSource, + f16_model_path: Path, + imatrix_path: Path | None, + output_repo: str, + quantisation_types: list[QuantisationType], + models_dir: Path, + ) -> dict[QuantisationType, QuantisationResult]: + """Execute all quantisation types with parallel uploads. + + Orchestrates the complete quantisation workflow including F16 processing, + multiple quantisation type execution, parallel upload management, and + README generation. Handles all aspects of the quantisation pipeline + from initial setup through final documentation. + + Returns: + Dictionary of quantisation results by type. 
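+
+        Example (illustrative arguments; paths and repository name are placeholders):
+            results = executor.execute_quantisations(
+                model_source,
+                f16_model_path=Path("./work/model/model-f16.gguf"),
+                imatrix_path=None,
+                output_repo="username/example-model-GGUF",
+                quantisation_types=[QuantisationType.Q4_K_M, QuantisationType.Q8_0],
+                models_dir=Path("./work"),
+            )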
+ """ + results: dict[QuantisationType, QuantisationResult] = {} + + # Track F16 in results if we converted from SafeTensors + if not model_source.is_gguf_repo: + results[QuantisationType.F16] = self._create_f16_result(f16_model_path) + + # Process with parallel uploads + upload_futures: list[Any] = [] + + with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor: + # Start F16 upload if applicable + if ( + not model_source.is_gguf_repo + and not self.no_upload + and QuantisationType.F16 in results + ): + self._start_f16_upload( + results, + model_source, + output_repo, + f16_model_path, + upload_executor, + upload_futures, + ) + + # Process each quantisation + for i, quant_type in enumerate(quantisation_types, 1): + # Skip if already marked as failed + if quant_type in results and results[quant_type].status == "failed": + logger.info( + f"Skipping {quant_type.value} - {results[quant_type].error_message}" + ) + continue + + self.progress_reporter.print_quantisation_start( + i, len(quantisation_types), quant_type.value + ) + + try: + result = self._process_single_quantisation( + quant_type, + model_source, + f16_model_path, + imatrix_path, + output_repo, + results, + models_dir, + upload_executor, + upload_futures, + ) + results[quant_type] = result + + # Force cleanup between quantisations + gc.collect() + + except Exception as e: + logger.error(f"āŒ Critical error processing {quant_type.value}: {e}") + logger.error("Exception traceback:") + for line in traceback.format_exc().splitlines(): + logger.error(f" {line}") + + results[quant_type] = QuantisationResult( + quantisation_type=quant_type, + success=False, + status="failed", + error_message=str(e), + ) + + # Force cleanup after error + gc.collect() + + # Wait for all uploads to complete + self._wait_for_uploads(upload_futures) + + # Final README update + if not self.no_upload and upload_futures: + self._final_readme_update(model_source, results, models_dir, output_repo) + + return results + + def _process_single_quantisation( + self, + quant_type: QuantisationType, + model_source: ModelSource, + f16_model_path: Path, + imatrix_path: Path | None, + output_repo: str, + results: dict[QuantisationType, QuantisationResult], + models_dir: Path, + upload_executor: ThreadPoolExecutor, + upload_futures: list, + ) -> QuantisationResult: + """Process a single quantisation type. + + Returns: + QuantisationResult for the processed type. 
+ """ + try: + logger.info(f"Starting {quant_type.value} quantisation...") + config = QUANTISATION_CONFIGS[quant_type] + + # Create initial result and update status + result = QuantisationResult(quantisation_type=quant_type, success=False) + result.status = "processing" + results[quant_type] = result + + self._update_readme_status(model_source, results, models_dir, output_repo) + + # Perform quantisation + context = QuantisationContext( + f16_model_path=f16_model_path, + model_source=model_source, + config=config, + models_dir=models_dir, + imatrix_path=imatrix_path, + ) + result = self.quantisation_engine.quantise(context) + + # Handle result + if result.success and result.file_path: + self._start_parallel_upload( + result, + quant_type, + output_repo, + model_source, + results, + models_dir, + upload_executor, + upload_futures, + ) + else: + result.status = "failed" + self._update_readme_status(model_source, results, models_dir, output_repo) + + except Exception as e: + logger.error(f"Error processing {quant_type.value}: {e}") + result = QuantisationResult(quantisation_type=quant_type, success=False) + result.status = "failed" + result.error_message = str(e) + + try: + self._update_readme_status(model_source, results, models_dir, output_repo) + except Exception as readme_error: + logger.error(f"Failed to update README after error: {readme_error}") + + return result + + def _start_parallel_upload( + self, + result: QuantisationResult, + quant_type: QuantisationType, + output_repo: str, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + models_dir: Path, + upload_executor: ThreadPoolExecutor, + upload_futures: list, + ) -> None: + """Start parallel upload of quantisation result.""" + if self.no_upload or not result.file_path: + return + + quant_str = getattr(result.quantisation_type, "value", result.quantisation_type) + logger.info(f"Starting parallel upload of {quant_str}...") + + upload_future = upload_executor.submit( + self._upload_and_cleanup, + output_repo, + result.file_path, + quant_type, + model_source, + results, + models_dir, + ) + upload_futures.append(upload_future) + + result.file_path = None # Mark as being uploaded + result.status = "uploading" + self._update_readme_status(model_source, results, models_dir, output_repo) + + def _upload_and_cleanup( + self, + output_repo: str, + file_path: Path, + quant_type: QuantisationType, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + models_dir: Path, + ) -> None: + """Upload file and clean up (runs in background thread).""" + try: + logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})") + self.uploader.upload_model_file(output_repo, file_path) + logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully") + + self.file_cleanup.cleanup_quantisation_file(file_path) + + results[quant_type].status = "completed" + updated_readme_path = self.readme_generator.generate( + model_source, results, models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, updated_readme_path) + + logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete") + + except Exception as e: + logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}") + results[quant_type].status = "failed" + results[quant_type].error_message = str(e) + + try: + updated_readme_path = self.readme_generator.generate( + model_source, results, models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, updated_readme_path) + except 
Exception as readme_error: + logger.error( + f"[PARALLEL] Failed to update README after upload error: {readme_error}" + ) + + def _start_f16_upload( + self, + results: dict[QuantisationType, QuantisationResult], + model_source: ModelSource, + output_repo: str, + f16_model_path: Path, + upload_executor: ThreadPoolExecutor, + upload_futures: list, + ) -> None: + """Start F16 upload in background.""" + f16_result = results[QuantisationType.F16] + if f16_result.file_path and f16_result.file_path.exists(): + logger.info("Starting parallel upload of F16 GGUF...") + f16_result.status = "uploading" + self._update_readme_status( + model_source, results, f16_model_path.parent.parent, output_repo + ) + + upload_future = upload_executor.submit( + self._upload_f16_and_cleanup, + output_repo, + f16_result.file_path, + model_source, + results, + f16_model_path.parent.parent, + ) + upload_futures.append(upload_future) + + def _upload_f16_and_cleanup( + self, + output_repo: str, + file_path: Path, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + models_dir: Path, + ) -> None: + """Upload F16 file and update status (runs in background thread).""" + try: + logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})") + self.uploader.upload_model_file(output_repo, file_path) + logger.info("[PARALLEL] Upload of F16 GGUF completed successfully") + + # Don't delete F16 yet - still needed for quantisations + + results[QuantisationType.F16].status = "completed" + updated_readme_path = self.readme_generator.generate( + model_source, results, models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, updated_readme_path) + + logger.info("[PARALLEL] F16 upload complete") + + except Exception as e: + logger.error(f"[PARALLEL] Failed to upload F16: {e}") + results[QuantisationType.F16].status = "failed" + results[QuantisationType.F16].error_message = str(e) + + try: + updated_readme_path = self.readme_generator.generate( + model_source, results, models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, updated_readme_path) + except Exception as readme_error: + logger.error( + f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}" + ) + + def _create_f16_result(self, f16_model_path: Path) -> QuantisationResult: + """Create a result object for F16 tracking. + + Returns: + QuantisationResult object for F16 tracking. 
+ """ + f16_size = "-" + if f16_model_path.exists(): + size_bytes = f16_model_path.stat().st_size + size_gb = size_bytes / (1024**3) + f16_size = f"{size_gb:.1f}GB" + + # Create a simple result object for F16 tracking + return type( + "F16Result", + (), + { + "quantisation_type": "F16", + "success": True, + "status": "planned", + "file_path": f16_model_path, + "file_size": f16_size, + }, + )() + + def _wait_for_uploads(self, upload_futures: list) -> None: + """Wait for all parallel uploads to complete.""" + if not upload_futures: + return + + logger.info(f"Waiting for {len(upload_futures)} uploads to complete...") + completed = 0 + failed = 0 + + for future in upload_futures: + try: + future.result(timeout=300) # 5 minute timeout per upload + completed += 1 + logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed") + except Exception as e: + failed += 1 + logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}") + + self.progress_reporter.print_upload_summary(completed, failed) + + def _update_readme_status( + self, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + models_dir: Path, + output_repo: str, + ) -> None: + """Update README with current quantisation status using rate limiting.""" + if not self.no_upload: + # Use rate limiter to batch updates + self.readme_limiter.request_update( + self._do_readme_update, + model_source, + results, + models_dir, + output_repo, + ) + + def _do_readme_update( + self, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + models_dir: Path, + output_repo: str, + ) -> None: + """Actually perform the README update (called by rate limiter).""" + updated_readme_path = self.readme_generator.generate( + model_source, results, models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, updated_readme_path) + + def _final_readme_update( + self, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + models_dir: Path, + output_repo: str, + ) -> None: + """Perform final README update after all operations.""" + logger.info("Updating README with final status...") + final_readme = self.readme_generator.generate( + model_source, results, models_dir, output_repo + ) + self.uploader.upload_readme(output_repo, final_readme) diff --git a/helpers/quantisation/model_manager.py b/helpers/quantisation/model_manager.py new file mode 100644 index 0000000..f827578 --- /dev/null +++ b/helpers/quantisation/model_manager.py @@ -0,0 +1,422 @@ +"""Model acquisition and preparation management. + +Handles model downloading from HuggingFace and preparation for quantisation, +including format detection and conversion. +""" + +from __future__ import annotations + +import shutil +import subprocess +import traceback +from typing import TYPE_CHECKING + +from helpers.filesystem import FilesystemService +from helpers.gguf import GGUFConverter +from helpers.logger import logger +from helpers.models.quantisation import ModelSource +from helpers.utils.config_parser import ConfigParser +from helpers.utils.tensor_mapping import TensorMapper + +if TYPE_CHECKING: + from pathlib import Path + + +class ModelManager: + """Handles model downloading and preparation for quantisation. + + Manages both GGUF repository downloads and HuggingFace model conversions, + providing unified interface for model acquisition and preparation. + """ + + def __init__(self, models_dir: Path) -> None: + """Initialise model manager with storage configuration. 
+ + Creates a new model manager instance that will handle model downloading, + format detection, and preparation for quantisation workflows using the + specified directory as the base storage location. + """ + self.models_dir = models_dir + self.fs = FilesystemService() + + def prepare_model(self, model_source: ModelSource) -> Path: + """Prepare model for quantisation and return F16 model path. + + Handles both GGUF repository downloads and regular HuggingFace model + conversion workflows with automatic format detection. Processes the + provided model source information to determine the optimal acquisition + strategy and ensures the model is in F16 GGUF format. + + Returns: + Path to F16 GGUF model ready for quantisation. + """ + model_dir = self.models_dir / model_source.model_name + + if model_source.is_gguf_repo: + return self._handle_gguf_repo(model_source, model_dir) + return self._handle_regular_repo(model_source, model_dir) + + def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path: + """Handle GGUF repository download with pattern matching. + + Downloads GGUF files matching specified patterns, prioritising + multi-part files and F16 variants. Uses the model source information + and target directory to efficiently locate and download appropriate + GGUF files from HuggingFace repositories. + + Returns: + Path to downloaded or existing GGUF file. + """ + logger.info(f"ā¬‡ļø Downloading GGUF file from repository: {model_source.source_model}") + logger.info(f"šŸ” Looking for file pattern: *{model_source.gguf_file_pattern}*") + + f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf" + + if f16_model.exists(): + logger.info(f"āœ… Found existing F16 file: {f16_model.name}") + return f16_model + + # Check for existing GGUF files + model_dir.mkdir(parents=True, exist_ok=True) + existing_gguf = self.fs.find_gguf_files(model_dir) + + if existing_gguf: + logger.info(f"āœ… Found existing GGUF file: {existing_gguf[0].name}") + return existing_gguf[0] + + # Download with patterns + downloaded_file = self._download_gguf_with_patterns( + model_source.source_model, model_source.gguf_file_pattern, model_dir + ) + + if downloaded_file: + # Handle multi-part files + if "00001-of-" in downloaded_file.name: + return downloaded_file + if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name: + base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace( + "-00003-of-", "-00001-of-" + ) + first_part = downloaded_file.parent / base_name + if first_part.exists(): + logger.info(f"šŸ”„ Using first part: {first_part.name}") + return first_part + + # Rename single file to standard name + downloaded_file.rename(f16_model) + return f16_model + + # Fallback to regular conversion + logger.info("šŸ’” Falling back to downloading full repository and converting...") + return self._handle_regular_repo( + ModelSource(**{**model_source.dict(), "is_gguf_repo": False}), + model_dir, + ) + + def _download_gguf_with_patterns( + self, source_model: str, pattern: str | None, model_dir: Path + ) -> Path | None: + """Download GGUF file using various pattern strategies. + + Tries multiple pattern variations to find and download appropriate + GGUF files, handling timeouts and temporary directories. Uses the + HuggingFace model identifier with an optional pattern to search for + specific files and downloads them to the target directory. + + Returns: + Path to downloaded file, or None if all patterns fail. 
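+
+        Example (illustrative): with pattern "Q8_0" the candidate globs are
+            ["*Q8_0*", "*q8_0*", "*Q8_0*", "*f16*", "*F16*", "*fp16*"]
+        tried in order until one download yields a GGUF file.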
+ """ + if pattern: + patterns = [ + f"*{pattern}*", + f"*{pattern.lower()}*", + f"*{pattern.upper()}*", + "*f16*", + "*F16*", + "*fp16*", + ] + else: + patterns = ["*f16*", "*F16*", "*fp16*"] + + temp_dir = model_dir / "gguf_temp" + + for search_pattern in patterns: + logger.info(f"šŸ” Trying pattern: {search_pattern}") + temp_dir.mkdir(exist_ok=True) + + try: + logger.debug( + f"DEBUG: Running huggingface-cli download for pattern {search_pattern}" + ) + result = subprocess.run( + [ + "timeout", + "300", + "huggingface-cli", + "download", + source_model, + "--include", + search_pattern, + "--local-dir", + str(temp_dir), + ], + check=True, + capture_output=True, + text=True, + ) + logger.debug( + f"DEBUG: Download command completed with return code {result.returncode}" + ) + + # Find downloaded GGUF files + gguf_files = self.fs.find_gguf_files(temp_dir, pattern) + if gguf_files: + found_file = gguf_files[0] + logger.info(f"āœ… Found GGUF file: {found_file.name}") + + # Move to parent directory + final_path = model_dir / found_file.name + shutil.move(str(found_file), str(final_path)) + shutil.rmtree(temp_dir) + return final_path + + except subprocess.CalledProcessError as e: + logger.debug( + f"DEBUG: Pattern {search_pattern} failed with return code {e.returncode}" + ) + if e.stderr: + logger.debug(f"DEBUG: stderr: {e.stderr}") + if e.stdout: + logger.debug(f"DEBUG: stdout: {e.stdout}") + logger.info(f"āš ļø Pattern {search_pattern} failed or timed out") + continue + except Exception as e: + logger.error(f"āŒ Unexpected error during download: {e}") + logger.error("Exception traceback:") + for line in traceback.format_exc().splitlines(): + logger.error(f" {line}") + continue + finally: + if temp_dir.exists(): + shutil.rmtree(temp_dir, ignore_errors=True) + + return None + + def _handle_regular_repo( + self, + model_source: ModelSource, + model_dir: Path, + ) -> Path: + """Handle regular HuggingFace repository conversion. + + Downloads full model repository and converts to F16 GGUF format + using our native Python-based GGUFConverter for SafeTensors models. + Processes the model source information and uses the local directory + for storage during the download and conversion workflow. + + Returns: + Path to converted F16 GGUF model. + """ + logger.info(f"ā¬‡ļø Downloading source model: {model_source.source_model}") + + # Download model if needed + if not model_dir.exists(): + self._download_repository(model_source.source_model, model_dir) + else: + logger.info("āœ… Model already downloaded") + + # Convert to GGUF + return self._convert_to_gguf(model_source, model_dir) + + def _setup_download_directories(self, model_dir: Path) -> None: + """Set up directories for model download. + + Creates the necessary directory structure for model downloads, + including the base model directory and HuggingFace metadata + directory to ensure proper organisation of downloaded assets. + """ + model_dir.mkdir(parents=True, exist_ok=True) + huggingface_dir = model_dir / ".huggingface" + huggingface_dir.mkdir(parents=True, exist_ok=True) + + def _create_download_process(self, source_model: str, model_dir: Path) -> subprocess.Popen: + """Create subprocess for downloading repository. + + Initiates a HuggingFace CLI download process for the specified model + identifier, configuring it to download to the local directory whilst + excluding existing GGUF files to avoid conflicts. + + Returns: + Subprocess for downloading. 
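+
+        Example (illustrative): for "org/model" this is roughly equivalent to
+            huggingface-cli download org/model --local-dir <model_dir> --exclude "*.gguf"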
+ """ + return subprocess.Popen( + [ + "huggingface-cli", + "download", + source_model, + "--local-dir", + str(model_dir), + "--exclude", + "*.gguf", + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, # Line buffered + universal_newlines=True, + ) + + def _stream_download_output(self, process: subprocess.Popen) -> None: + """Stream download process output with appropriate logging levels. + + Monitors the download subprocess output and routes progress information + to appropriate log levels, providing real-time feedback on download + progress whilst filtering debug information appropriately. + """ + if process.stdout: + for line in process.stdout: + # Log download progress lines + if line.strip(): + # Check if it's a progress line (contains %) + if "%" in line or "Downloading" in line or "Fetching" in line: + # Use info level for progress lines + logger.info(f" {line.strip()}") + else: + # Use debug for other output + logger.debug(f" {line.strip()}") + + def _handle_download_errors(self, source_model: str, e: Exception) -> None: + """Handle download errors with detailed logging. + + Processes download exceptions for the specified model, providing + comprehensive error logging including return codes, stderr, and + stdout information to aid in debugging download failures. + + Raises: + TypeError: Always raised with appropriate error message. + """ + if isinstance(e, subprocess.CalledProcessError): + logger.error(f"āŒ Failed to download repository {source_model}") + logger.error(f"Return code: {e.returncode}") + if e.stderr: + logger.error(f"stderr: {e.stderr}") + if e.stdout: + logger.error(f"stdout: {e.stdout}") + msg = f"Repository download failed: {e}" + raise TypeError(msg) from e + logger.error(f"āŒ Unexpected error during repository download: {e}") + logger.error("Exception traceback:") + for line in traceback.format_exc().splitlines(): + logger.error(f" {line}") + msg = f"Repository download failed: {e}" + raise TypeError(msg) from e + + def _download_repository(self, source_model: str, model_dir: Path) -> None: + """Download HuggingFace repository. + + Orchestrates the complete repository download workflow for the + specified HuggingFace model, managing directory setup, process + execution, and error handling to ensure robust model acquisition. + + Raises: + RuntimeError: If download fails. + """ + self._setup_download_directories(model_dir) + + try: + logger.info(f"ā¬‡ļø Downloading full repository: {source_model}") + logger.info("šŸ“Š Progress will be shown below...") + + process = self._create_download_process(source_model, model_dir) + self._stream_download_output(process) + + # Wait for process to complete + return_code = process.wait() + + if return_code != 0: + msg = f"Repository download failed with return code {return_code}" + raise RuntimeError(msg) + + logger.info("āœ… Repository download completed successfully") + + except Exception as e: + self._handle_download_errors(source_model, e) + + def _convert_to_gguf(self, model_source: ModelSource, model_dir: Path) -> Path: + """Convert model to GGUF F16 format. + + Converts SafeTensors models to GGUF F16 format using our native + Python converter. Processes model source information and the + directory containing downloaded model files, handling architecture + detection and tensor mapping for optimal compatibility. + + Returns: + Path to F16 GGUF model. + + Raises: + RuntimeError: If conversion fails. 
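+
+        Example (illustrative): for original author "acme" and model name
+        "tiny-llm" the converted file is written to
+            <model_dir>/acme-tiny-llm-f16.gguf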
+ """ + logger.info("šŸ”„ Converting to GGUF F16 format...") + f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf" + + if f16_model.exists(): + logger.info("āœ… F16 model already exists") + return f16_model + + # Check for SafeTensors files + safetensor_files = list(model_dir.glob("*.safetensors")) + if not safetensor_files: + logger.error("āŒ Model format not supported") + logger.info("šŸ’” This tool supports GGUF and SafeTensors formats") + msg = "Model must be in GGUF or SafeTensors format" + raise RuntimeError(msg) + + logger.info("šŸ Using native Python GGUFConverter...") + logger.info(f"āœ… Found {len(safetensor_files)} SafeTensors files") + + # Load model configuration + config_parser = ConfigParser() + model_config = config_parser.load_model_config(model_dir) + + # Get architecture mapping + arch_name = model_config.architectures[0] if model_config.architectures else "llama" + arch = config_parser.get_architecture_mapping(arch_name) + + if arch != arch_name: + logger.info(f"šŸ“ Architecture mapping: {arch_name} → {arch}") + + # Check if architecture is supported by llama.cpp + supported_archs = { + "llama", + "qwen2", + "gemma", + "phi3", + "falcon", + "gpt2", + "gptj", + "gptneox", + "mpt", + "baichuan", + "stablelm", + } + + if arch not in supported_archs: + logger.warning("=" * 70) + logger.warning(f"āš ļø Architecture '{arch_name}' may not be supported by llama.cpp") + logger.warning(f"āš ļø The GGUF will be created with architecture: '{arch}'") + logger.warning("āš ļø Check if your inference software supports this architecture.") + logger.warning("=" * 70) + + # Convert using GGUFConverter + tensor_mapper = TensorMapper() + success = GGUFConverter.convert_safetensors( + model_dir, f16_model, model_config, arch, tensor_mapper + ) + + if not success: + logger.error("āŒ Native Python conversion failed") + msg = "Failed to convert SafeTensors model to GGUF" + raise RuntimeError(msg) + + logger.info("āœ… Native Python conversion successful") + return f16_model diff --git a/helpers/quantisation/orchestrator.py b/helpers/quantisation/orchestrator.py new file mode 100644 index 0000000..a300c6a --- /dev/null +++ b/helpers/quantisation/orchestrator.py @@ -0,0 +1,229 @@ +"""Main quantisation orchestrator. + +Provides the high-level orchestration of the complete quantisation +workflow, coordinating between various services and modules. 
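+
+Example (illustrative sketch; the URL is a placeholder):
+
+    from helpers.quantisation.orchestrator import QuantisationOrchestrator
+
+    orchestrator = QuantisationOrchestrator(use_imatrix=True, no_upload=False)
+    results = orchestrator.quantise("https://huggingface.co/org/model")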
+""" + +from __future__ import annotations + +import signal +import sys +import traceback +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING + +from helpers.filesystem import FileCleanup, WorkspaceManager +from helpers.huggingface import HuggingFaceUploader +from helpers.llama_cpp import IMatrixGenerator, IMatrixHandler +from helpers.logger import logger +from helpers.models.quantisation import QuantisationResult, QuantisationType +from helpers.quantisation.engine import QuantisationEngine +from helpers.quantisation.executor import QuantisationExecutor +from helpers.quantisation.model_manager import ModelManager +from helpers.quantisation.profile_manager import ProfileManager +from helpers.quantisation.progress import ProgressReporter +from helpers.readme import ReadmeGenerator +from helpers.utils.rate_limiter import ReadmeRateLimiter +from helpers.utils.tensor_mapping import URLParser + +if TYPE_CHECKING: + from types import FrameType + + from helpers.models.quantisation import ModelSource + + +@dataclass(slots=True) +class QuantisationOrchestrator: + """Orchestrates the complete quantisation workflow. + + Thin coordinator that delegates to specialised services for + each aspect of the quantisation workflow. + """ + + work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work") + use_imatrix: bool = True + no_upload: bool = False + custom_profiles: list[str] | None = None + + # Service dependencies + url_parser: URLParser = field(default_factory=URLParser) + workspace_manager: WorkspaceManager = field(init=False) + model_manager: ModelManager = field(init=False) + profile_manager: ProfileManager = field(default_factory=ProfileManager) + progress_reporter: ProgressReporter = field(default_factory=ProgressReporter) + quantisation_executor: QuantisationExecutor = field(init=False) + imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler) + imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator) + readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator) + uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader) + file_cleanup: FileCleanup = field(default_factory=FileCleanup) + readme_limiter: ReadmeRateLimiter = field(init=False) + + def __post_init__(self) -> None: + """Initialise computed properties after dataclass construction.""" + self.workspace_manager = WorkspaceManager(self.work_dir) + self.model_manager = ModelManager(self.workspace_manager.models_dir) + self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0) + + # Create executor with dependencies + self.quantisation_executor = QuantisationExecutor( + quantisation_engine=QuantisationEngine(), + uploader=self.uploader, + readme_generator=self.readme_generator, + file_cleanup=self.file_cleanup, + no_upload=self.no_upload, + ) + + # Set up signal handlers + self._setup_signal_handlers() + + def _setup_signal_handlers(self) -> None: + """Set up signal handlers to catch unexpected exits.""" + + def signal_handler(signum: int, frame: FrameType | None) -> None: + logger.error(f"āŒ Received signal {signum} ({signal.Signals(signum).name})") + logger.error("Stack trace at signal:") + if frame: + for line in traceback.format_stack(frame): + logger.error(f" {line.strip()}") + logger.error("Exiting due to signal") + sys.exit(1) + + # Handle common termination signals + for sig in [signal.SIGINT, signal.SIGTERM]: + signal.signal(sig, signal_handler) + + def quantise(self, url: str) -> 
dict[QuantisationType, QuantisationResult]: + """Main quantisation workflow orchestrating model processing from URL to upload. + + Coordinates the complete quantisation process from URL parsing through + model downloading, quantisation execution, and upload to HuggingFace. + Handles architecture compatibility and provides comprehensive error handling. + + Returns: + Dictionary of quantisation results by type. + + Raises: + KeyboardInterrupt: If the user interrupts the quantisation process. + """ + logger.info("Starting Bartowski quantisation process...") + logger.debug(f"DEBUG: Input URL: {url}") + logger.debug(f"DEBUG: Working directory: {self.work_dir}") + logger.debug(f"DEBUG: Use imatrix: {self.use_imatrix}") + logger.debug(f"DEBUG: No upload: {self.no_upload}") + logger.debug(f"DEBUG: Custom profiles: {self.custom_profiles}") + + try: + # Setup and preparation + model_source, f16_model_path, imatrix_path, output_repo = self._setup_environment(url) + + # Create initial repository + self._create_initial_repository(model_source, output_repo) + + # Get quantisation types + quantisation_types = self.profile_manager.get_quantisation_types(self.custom_profiles) + + # Filter by architecture if needed + supported_types, unsupported_types = self.profile_manager.filter_by_architecture( + quantisation_types, f16_model_path + ) + + # Pre-mark unsupported types + results: dict[QuantisationType, QuantisationResult] = {} + for quant_type in unsupported_types: + results[quant_type] = QuantisationResult( + quantisation_type=quant_type, + success=False, + status="failed", + error_message="K-quant requires llama.cpp architecture support", + ) + + # Execute quantisations + execution_results = self.quantisation_executor.execute_quantisations( + model_source, + f16_model_path, + imatrix_path, + output_repo, + supported_types, + self.workspace_manager.models_dir, + ) + results.update(execution_results) + + # Cleanup + self.file_cleanup.cleanup_files( + f16_model_path, model_source, self.workspace_manager.models_dir + ) + + # Print summary + self.progress_reporter.print_completion_summary(model_source, results, output_repo) + + except KeyboardInterrupt: + logger.error("āŒ Process interrupted by user (Ctrl+C)") + raise + except Exception as e: + logger.error(f"āŒ Critical error in quantisation workflow: {e}") + logger.error("Full traceback:") + for line in traceback.format_exc().splitlines(): + logger.error(f" {line}") + raise + finally: + # Always flush pending README updates before exiting + self.readme_limiter.flush() + + return results + + def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]: + """Setup environment and prepare model for quantisation. + + Returns: + Tuple of (model_source, f16_model_path, imatrix_path, output_repo). 
+ """ + model_source = self.url_parser.parse(url) + self.progress_reporter.print_model_info( + model_source, self.uploader.get_username(), str(self.work_dir) + ) + + f16_model_path = self.model_manager.prepare_model(model_source) + + output_repo = ( + f"{self.uploader.get_username()}/" + f"{model_source.original_author}-{model_source.model_name}-GGUF" + ) + + imatrix_path = None + if self.use_imatrix: + logger.info("Checking for importance matrix (imatrix)...") + model_dir = self.workspace_manager.get_model_dir(model_source.model_name) + imatrix_path = self.imatrix_handler.find_imatrix(model_dir) + + # If no imatrix found, offer to generate or provide one + if not imatrix_path: + # First offer to generate + imatrix_path = self.imatrix_generator.prompt_for_generation( + model_source, model_dir, f16_model_path + ) + + # If generation was skipped, offer to provide existing one + if not imatrix_path: + imatrix_path = self.imatrix_handler.prompt_for_user_imatrix(model_dir) + + return model_source, f16_model_path, imatrix_path, output_repo + + def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None: + """Create initial repository with planned quantisations.""" + logger.info("Creating initial README with planned quantisations...") + quantisation_types = self.profile_manager.get_quantisation_types(self.custom_profiles) + planned_results = { + qt: QuantisationResult(quantisation_type=qt, success=False, status="planned") + for qt in quantisation_types + } + readme_path = self.readme_generator.generate( + model_source, planned_results, self.workspace_manager.models_dir, output_repo + ) + + if not self.no_upload: + logger.info("Creating repository with planned quantisations...") + self.uploader.upload_readme(output_repo, readme_path) + else: + logger.info("Skipping repository creation (--no-upload specified)") diff --git a/helpers/quantisation/profile_manager.py b/helpers/quantisation/profile_manager.py new file mode 100644 index 0000000..79bd1f1 --- /dev/null +++ b/helpers/quantisation/profile_manager.py @@ -0,0 +1,132 @@ +"""Quantisation profile management. + +Manages selection and validation of quantisation types based on +user preferences, architecture support, and configuration. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from helpers.config.quantisation_configs import ( + DEFAULT_QUANTISATION_TYPES, + SUPPORTED_QUANTISATION_TYPES, +) +from helpers.llama_cpp.architecture import ArchitectureDetector +from helpers.logger import logger +from helpers.models.quantisation import QuantisationType + +if TYPE_CHECKING: + from pathlib import Path + + +class ProfileManager: + """Manages quantisation profiles and type selection. + + Handles selection of quantisation types based on custom profiles, + architecture support, and fallback to defaults. + """ + + @staticmethod + def get_quantisation_types( + custom_profiles: list[str] | None = None, + ) -> list[QuantisationType]: + """Get the quantisation types to use for this run. + + Determines which quantisation types should be processed based on + custom profiles provided by the user, or falls back to default + configurations if no custom profiles are specified. + + Returns: + List of QuantisationType enums to process. 
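+
+        Example (illustrative, assuming both profiles are supported):
+            ProfileManager.get_quantisation_types(["Q4_K_M", "Q8_0"])
+            # -> [QuantisationType.Q4_K_M, QuantisationType.Q8_0]
+            ProfileManager.get_quantisation_types(None)
+            # -> DEFAULT_QUANTISATION_TYPES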
+ """ + if custom_profiles: + return ProfileManager._parse_custom_profiles(custom_profiles) + return DEFAULT_QUANTISATION_TYPES + + @staticmethod + def _parse_custom_profiles(profile_strings: list[str]) -> list[QuantisationType]: + """Parse custom profile strings to QuantisationType enums. + + Validates and converts user-provided profile strings into proper + QuantisationType enumerations, filtering out invalid or unsupported + types whilst logging warnings for problematic entries. + + Returns: + List of valid QuantisationType enums. + """ + result = [] + for profile_str in profile_strings: + try: + profile = QuantisationType(profile_str.upper()) + if profile in SUPPORTED_QUANTISATION_TYPES: + result.append(profile) + else: + logger.warning(f"Profile {profile_str} is not supported, skipping") + except ValueError: + logger.warning(f"Invalid profile {profile_str}, skipping") + + # Fall back to defaults if no valid profiles + return result or DEFAULT_QUANTISATION_TYPES + + @staticmethod + def filter_by_architecture( + quantisation_types: list[QuantisationType], + f16_model_path: Path, + ) -> tuple[list[QuantisationType], list[QuantisationType]]: + """Filter quantisation types based on architecture support. + + Analyses the F16 GGUF model to determine architecture compatibility + and filters the requested quantisation types accordingly. Separates + supported types from unsupported ones, especially filtering K-quants + for architectures not supported by llama.cpp. + + Returns: + Tuple of (supported_types, unsupported_types). + """ + if not ArchitectureDetector.check_architecture_support(f16_model_path): + # Architecture not supported - filter out K-quants + basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] + supported = [] + unsupported = [] + + for quant_type in quantisation_types: + if quant_type.value in basic_types: + supported.append(quant_type) + else: + unsupported.append(quant_type) + + if unsupported: + logger.warning( + "āš ļø Architecture not supported by llama.cpp - K-quants will be skipped" + ) + logger.info("šŸ’” Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated") + + return supported, unsupported + + # All types supported + return quantisation_types, [] + + @staticmethod + def validate_profiles(profiles: list[str]) -> list[str]: + """Validate a list of profile strings. + + Checks each profile string to ensure it corresponds to a valid + and supported quantisation type, logging warnings for invalid + entries whilst returning only the valid profile strings. + + Returns: + List of valid profile strings. + """ + valid = [] + for profile in profiles: + try: + quant_type = QuantisationType(profile.upper()) + if quant_type in SUPPORTED_QUANTISATION_TYPES: + valid.append(profile) + else: + logger.warning(f"Profile {profile} exists but is not supported") + except ValueError: + logger.warning(f"Profile {profile} is not a valid quantisation type") + + return valid diff --git a/helpers/quantisation/progress.py b/helpers/quantisation/progress.py new file mode 100644 index 0000000..84cf62e --- /dev/null +++ b/helpers/quantisation/progress.py @@ -0,0 +1,151 @@ +"""Progress tracking and reporting for quantisation workflow. + +Provides utilities for tracking quantisation progress, generating +status reports, and displaying completion summaries. 
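+
+Example (illustrative):
+
+    reporter = ProgressReporter()
+    reporter.print_quantisation_start(1, 14, "Q4_K_M")
+    bar = ProgressReporter.format_progress_bar(3, 10)  # such as "[ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–‘...] 30.0%"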
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from helpers.logger import logger + +if TYPE_CHECKING: + from helpers.models.quantisation import ModelSource, QuantisationResult, QuantisationType + + +class ProgressReporter: + """Reports progress and status of quantisation operations. + + Provides methods for displaying model information, progress updates, + and completion summaries throughout the quantisation workflow. + """ + + @staticmethod + def print_model_info(model_source: ModelSource, username: str, work_dir: str) -> None: + """Print model information at start of processing. + + Displays comprehensive information about the model being processed, + including source details, author information, and working directory + to provide clear context at the beginning of quantisation workflows. + """ + logger.info(f"Source URL: {model_source.url}") + logger.info(f"Source model: {model_source.source_model}") + logger.info(f"Original author: {model_source.original_author}") + logger.info(f"Model name: {model_source.model_name}") + logger.info(f"Your HF username: {username}") + logger.info(f"Working directory: {work_dir}") + + @staticmethod + def print_quantisation_start( + index: int, + total: int, + quant_type: str, + ) -> None: + """Print message when starting a quantisation. + + Displays progress information showing which quantisation is currently + being processed within the overall batch, providing clear feedback + about workflow advancement and the specific type being quantised. + """ + logger.info(f"Processing quantisation {index}/{total}: {quant_type}") + + @staticmethod + def print_completion_summary( + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + output_repo: str, + ) -> None: + """Print completion summary with results. + + Generates comprehensive completion report showing successful quantisations, + file information, and repository links. Provides detailed feedback on + the overall quantisation workflow outcome and model availability. + """ + successful_results = [r for r in results.values() if r.success] + + if successful_results: + logger.info("Complete! Your quantised models are available at:") + logger.info(f" https://huggingface.co/{output_repo}") + logger.info("Model info:") + logger.info(f" - Source URL: {model_source.url}") + logger.info(f" - Original: {model_source.source_model}") + logger.info( + " - Method: " + f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}" + ) + logger.info(f" - Quantised: {output_repo}") + + for result in successful_results: + if result.file_size: + filename = ( + f"{model_source.original_author}-{model_source.model_name}-" + f"{result.quantisation_type}.gguf" + ) + logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})") + else: + logger.error( + "All quantisations failed - repository created with documentation " + "but no model files" + ) + logger.error(f" Repository: https://huggingface.co/{output_repo}") + + @staticmethod + def print_upload_summary(completed: int, failed: int) -> None: + """Print upload completion summary. + + Reports the final upload statistics showing successful and failed + uploads with appropriate warning or success messaging based on + the outcome of the upload batch process. 
+ """ + if failed > 0: + logger.warning(f"Upload summary: {completed} succeeded, {failed} failed") + else: + logger.info(f"All {completed} uploads completed successfully") + + @staticmethod + def print_architecture_warning() -> None: + """Print warning about unsupported architecture.""" + logger.warning("āš ļø Architecture not supported by llama.cpp - K-quants will be skipped") + logger.info("šŸ’” Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated") + + @staticmethod + def get_status_emoji(status: str) -> str: + """Get emoji for a given status. + + Maps status strings to appropriate emoji representations for enhanced + visual feedback in progress reporting. Provides a default emoji for + unknown status values to maintain consistent display formatting. + + Returns: + Appropriate emoji for the status. + """ + status_emojis = { + "planned": "šŸ“‹", + "processing": "āš™ļø", + "uploading": "šŸ“¤", + "completed": "āœ…", + "failed": "āŒ", + } + return status_emojis.get(status, "ā“") + + @staticmethod + def format_progress_bar(current: int, total: int, width: int = 30) -> str: + """Format a text progress bar. + + Creates a visual progress representation using Unicode block characters + with percentage display. Handles edge cases like zero totals and + calculates appropriate fill ratios for the specified width. + + Returns: + Formatted progress bar string. + """ + if total == 0: + return "[" + " " * width + "]" + + progress = int((current / total) * width) + filled = "ā–ˆ" * progress + empty = "ā–‘" * (width - progress) + percentage = (current / total) * 100 + + return f"[{filled}{empty}] {percentage:.1f}%" diff --git a/helpers/readme/__init__.py b/helpers/readme/__init__.py new file mode 100644 index 0000000..eb6b9eb --- /dev/null +++ b/helpers/readme/__init__.py @@ -0,0 +1,23 @@ +"""README generation for quantised models. + +Provides utilities for generating comprehensive documentation including +model cards, quantisation tables, and status tracking. +""" + +from __future__ import annotations + +from helpers.readme.formatter import ( + FileSizeFormatter, + StatusFormatter, + TableFormatter, + TagFormatter, +) +from helpers.readme.generator import ReadmeGenerator + +__all__ = [ + "FileSizeFormatter", + "ReadmeGenerator", + "StatusFormatter", + "TableFormatter", + "TagFormatter", +] diff --git a/helpers/readme/formatter.py b/helpers/readme/formatter.py new file mode 100644 index 0000000..b90c399 --- /dev/null +++ b/helpers/readme/formatter.py @@ -0,0 +1,265 @@ +"""README formatting utilities. + +Provides formatters for status indicators, tables, and other README elements. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from helpers.config.quantisation_configs import QUANTISATION_CONFIGS +from helpers.models.quantisation import QuantisationResult, QuantisationType + +if TYPE_CHECKING: + from pathlib import Path + + from helpers.models.quantisation import ModelSource + +# File size constant +GIBIBYTE = 1024**3 + + +class StatusFormatter: + """Formats status indicators for README tables.""" + + @staticmethod + def format_status( + result: QuantisationResult, + model_source: ModelSource, + quant_type: QuantisationType, + output_repo: str | None, + ) -> str: + """Format status indicator for README table. + + Creates appropriate status indicator based on quantisation state + including progress indicators, file sizes, and download links. + + Returns: + Formatted status string for table cell. 
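+
+        Example (illustrative): a queued result renders as "ā³ Queued", an
+        in-progress upload with a known size as "ā¬†ļø Uploading... (4.5GB)", and
+        a successful quantisation as a Markdown link such as
+            [āœ… 4.5GB](https://huggingface.co/<output_repo>?show_file_info=<filename>)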
+ """ + status_map = { + "planned": "ā³ Queued", + "processing": "šŸ”„ Processing...", + "uploading": "ā¬†ļø Uploading...", + "failed": "āŒ Failed", + } + + if hasattr(result, "status") and result.status in status_map: + base_status = status_map[result.status] + + # Check for architecture not supported error + if ( + result.status == "failed" + and hasattr(result, "error_message") + and result.error_message + and "architecture not supported" in str(result.error_message).lower() + ): + return "āš ļø Skipped" + + if result.status == "uploading" and hasattr(result, "file_size") and result.file_size: + return f"{base_status} ({result.file_size})" + + if result.status == "completed" or (hasattr(result, "success") and result.success): + return StatusFormatter.format_success_status( + result, model_source, quant_type, output_repo + ) + + return base_status + + # Legacy support + if hasattr(result, "success") and result.success: + return StatusFormatter.format_success_status( + result, model_source, quant_type, output_repo + ) + + return "āŒ Failed" + + @staticmethod + def format_success_status( + result: QuantisationResult, + model_source: ModelSource, + quant_type: QuantisationType, + output_repo: str | None, + ) -> str: + """Format successful quantisation status with download link. + + Creates a download link if repository information is available, + otherwise shows file size. + + Returns: + Formatted success status string. + """ + if not output_repo: + return ( + f"āœ… {result.file_size}" + if hasattr(result, "file_size") and result.file_size + else "āœ… Available" + ) + + filename = ( + f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf" + ) + url = f"https://huggingface.co/{output_repo}?show_file_info={filename}" + + if hasattr(result, "file_size") and result.file_size: + return f"[āœ… {result.file_size}]({url})" + + return f"[āœ… Available]({url})" + + +class TableFormatter: + """Formats quantisation tables for README.""" + + @staticmethod + def get_ordered_quantisation_types() -> list[QuantisationType]: + """Get quantisation types in display order. + + Returns types ordered by precision level and variant. + + Returns: + Ordered list of quantisation types. + """ + return [ + # Q3 K-quants + QuantisationType.Q3_K_M, + QuantisationType.Q3_K_L, + QuantisationType.Q3_K_XL, + # Q4 types + QuantisationType.Q4_0, # Basic + QuantisationType.Q4_K_M, + QuantisationType.Q4_K_L, + # Q5 types + QuantisationType.Q5_0, # Basic + QuantisationType.Q5_K_M, + QuantisationType.Q5_K_L, + # Q6 types + QuantisationType.Q6_0, # Basic + QuantisationType.Q6_K, + QuantisationType.Q6_K_L, + # Q8 types + QuantisationType.Q8_0, # Basic + QuantisationType.Q8_K, + ] + + @staticmethod + def format_quantisation_row( + quant_type: QuantisationType, + result: QuantisationResult | None, + model_source: ModelSource, + output_repo: str | None, + ) -> str: + """Format a single quantisation table row. + + Creates a formatted table row for the README displaying quantisation + type, configuration details, and status information. Handles cases + where no result is available by creating a default planned result. + + Returns: + Formatted table row string. 
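+
+        Example (illustrative): a planned Q4_K_M entry produces a row similar to
+            | **Q4_K_M** | <config summary> | ā³ Queued |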
+ """ + # Create default result if none exists + if result is None: + result = QuantisationResult( + quantisation_type=quant_type, success=False, status="planned" + ) + + # Get configuration + config = QUANTISATION_CONFIGS.get(quant_type) + + # Format status + status_formatter = StatusFormatter() + status = status_formatter.format_status(result, model_source, quant_type, output_repo) + + # Get configuration description + config_desc = ( + config.get_compact_config(QUANTISATION_CONFIGS) + if config + else f"{quant_type} all layers" + ) + + return f"| **{quant_type.value}** | {config_desc} | {status} |\n" + + +class TagFormatter: + """Formats tags for README frontmatter.""" + + @staticmethod + def build_tags( + results: dict[QuantisationType, QuantisationResult], + original_tags: list[str] | None = None, + ) -> list[str]: + """Build tags based on quantisation results. + + Generates appropriate tags for the model repository based on + successful quantisations and combines them with any original + tags from the source model to create a comprehensive tag list. + + Returns: + Sorted list of unique tags. + """ + our_tags = ["gguf"] + + # Add tags for successful quantisations + for quant_type, result in results.items(): + if hasattr(result, "status") and result.status == "completed": + if quant_type == QuantisationType.F16: + our_tags.append("f16") + elif hasattr(result, "quantisation_type"): + # Convert to lowercase tag format + our_tags.append(result.quantisation_type.value.lower()) + + # Check for F16 availability + if ( + len(our_tags) == 1 + and QuantisationType.F16 in results + and hasattr(results[QuantisationType.F16], "status") + and results[QuantisationType.F16].status in {"completed", "uploading"} + ): + our_tags.append("f16") + + # Combine with original tags + all_tags = our_tags + if original_tags: + all_tags = sorted(set(our_tags + original_tags)) + + return all_tags + + +class FileSizeFormatter: + """Formats file sizes for display.""" + + @staticmethod + def format_size_bytes(size_bytes: int) -> str: + """Format bytes to human-readable size. + + Converts raw byte values into human-readable format using appropriate + units (B, KB, MB, GB) with decimal precision for larger values to + provide clear file size information in documentation. + + Returns: + Formatted size string (e.g., "4.5GB"). + """ + if size_bytes < 1024: + return f"{size_bytes}B" + if size_bytes < 1024**2: + return f"{size_bytes / 1024:.1f}KB" + if size_bytes < GIBIBYTE: + return f"{size_bytes / (1024**2):.1f}MB" + return f"{size_bytes / GIBIBYTE:.1f}GB" + + @staticmethod + def get_file_size(file_path: Path) -> str: + """Get formatted file size from path. + + Retrieves file size information from the filesystem and formats + it into human-readable format. Handles non-existent files gracefully + by returning a placeholder string for missing files. + + Returns: + Formatted size string or "-" if file doesn't exist. + """ + if not file_path.exists(): + return "-" + + size_bytes = file_path.stat().st_size + return FileSizeFormatter.format_size_bytes(size_bytes) diff --git a/helpers/readme/generator.py b/helpers/readme/generator.py new file mode 100644 index 0000000..d4fb990 --- /dev/null +++ b/helpers/readme/generator.py @@ -0,0 +1,311 @@ +"""README generation for quantised models. + +Coordinates README creation by combining templates, formatting, and +original model information. 
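+
+Example (illustrative; the arguments are placeholders for objects built earlier
+in the workflow):
+
+    readme_path = ReadmeGenerator().generate(model_source, results, models_dir, output_repo)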
+""" + +from __future__ import annotations + +import json +import re +from typing import TYPE_CHECKING + +from helpers.logger import logger +from helpers.models.quantisation import QuantisationType +from helpers.readme.formatter import ( + FileSizeFormatter, + TableFormatter, + TagFormatter, +) +from helpers.readme.templates import ( + get_f16_row_template, + get_frontmatter_template, + get_header_template, + get_original_model_section, + get_quantisation_info, +) +from helpers.utils.config_parser import ConfigParser + +if TYPE_CHECKING: + from pathlib import Path + + from helpers.models.quantisation import ModelSource, QuantisationResult + +# File size constant +GIBIBYTE = 1024**3 + + +class ReadmeGenerator: + """Generates README files for quantised models. + + Creates comprehensive README documentation including model cards, + quantisation details, and status tracking. Supports both initial + planning documentation and final result summaries. + """ + + def generate( + self, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + models_dir: Path, + output_repo: str | None = None, + ) -> Path: + """Generate README file for quantised model repository. + + Creates a comprehensive README with frontmatter, quantisation table, + and original model information. Handles status tracking for planned, + processing, and completed quantisations. + + Returns: + Path to generated README file. + """ + logger.info("Creating model card...") + model_dir = models_dir / model_source.model_name + readme_path = model_dir / "README.md" + + # Get original README content + original_content = self._get_original_readme(model_source, model_dir) + + # Generate new README + readme_content = self._generate_readme_content( + model_source, results, original_content, output_repo, models_dir + ) + + readme_path.write_text(readme_content) + return readme_path + + def _get_architecture(self, model_dir: Path) -> str | None: + """Get the architecture from the model's config.json. + + Returns: + Architecture name or None if not found. + """ + config_path = model_dir / "config.json" + if not config_path.exists(): + return None + + try: + with config_path.open(encoding="utf-8") as f: + config = json.load(f) + + # Get the architectures field - it's a list + architectures = config.get("architectures", []) + if architectures: + arch_name = architectures[0] + # Get the mapped architecture (what it will be converted to) + parser = ConfigParser() + mapped_arch = parser.get_architecture_mapping(arch_name) + logger.info(f"Architecture: {arch_name} -> {mapped_arch}") + return mapped_arch + + except Exception as e: + logger.warning(f"Could not determine architecture: {e}") + + return None + + def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]: + """Extract original README and metadata. + + Downloads or reads the original model's README for inclusion in the + quantised model documentation. Parses YAML frontmatter if present. + + Returns: + Dictionary with readme content, licence, tags, and frontmatter. 
+ """ + content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""} + + # Check for preserved original README first + original_readme_path = model_dir / "README.original.md" + readme_path = model_dir / "README.md" + + if original_readme_path.exists(): + # Use the preserved original + content["readme"] = original_readme_path.read_text(encoding="utf-8") + logger.info(f"Found preserved original README ({len(content['readme'])} characters)") + elif readme_path.exists(): + # First time - preserve the original and use it + readme_content = readme_path.read_text(encoding="utf-8") + + # Check if this is already our generated README + if ( + f"{model_source.original_author}-{model_source.model_name}-GGUF" + not in readme_content + ): + # This is the original - preserve it + original_readme_path.write_text(readme_content) + content["readme"] = readme_content + logger.info(f"Preserved original README ({len(readme_content)} characters)") + else: + # This is our README, try to extract original content + logger.info("Found existing generated README, extracting original content") + # Try to find the separator + separator_idx = readme_content.find("\n---\n\n## Original Model Information\n") + if separator_idx > 0: + content["readme"] = readme_content[separator_idx + 37 :] + else: + logger.info("No README found to preserve") + + # Parse frontmatter if we have content + if content["readme"]: + parsed = self._parse_frontmatter(content["readme"]) + content.update(parsed) + + return content + + def _parse_frontmatter(self, readme_text: str) -> dict[str, str]: + """Parse YAML frontmatter from README. + + Extracts metadata from YAML frontmatter including licence, tags, + and other model card fields. + + Returns: + Dictionary with separated content and metadata. + """ + lines = readme_text.split("\n") + if lines[0] != "---": + return { + "readme": readme_text, + "licence": "apache-2.0", + "tags": "", + "frontmatter": "", + } + + frontmatter_end = -1 + for i, line in enumerate(lines[1:], 1): + if line == "---": + frontmatter_end = i + break + + if frontmatter_end == -1: + return { + "readme": readme_text, + "licence": "apache-2.0", + "tags": "", + "frontmatter": "", + } + + frontmatter = "\n".join(lines[1:frontmatter_end]) + content = "\n".join(lines[frontmatter_end + 1 :]) + + # Extract licence + licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE) + licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0" + + # Extract tags + tags = [] + in_tags = False + for line in frontmatter.split("\n"): + if line.startswith("tags:"): + in_tags = True + continue + if in_tags: + if line.startswith("- "): + tags.append(line[2:].strip()) + elif line and not line.startswith(" "): + break + + return { + "readme": content, + "licence": licence_val, + "tags": ",".join(tags), + "frontmatter": frontmatter, + } + + def _generate_readme_content( + self, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + original_content: dict[str, str], + output_repo: str | None = None, + models_dir: Path | None = None, + ) -> str: + """Generate complete README content with quantisation details. + + Creates the full README including YAML frontmatter, quantisation status + table, and original model information. + + Returns: + Complete README markdown content. 
+ """ + # Build tags + tag_formatter = TagFormatter() + original_tags = original_content["tags"].split(",") if original_content["tags"] else [] + all_tags = tag_formatter.build_tags(results, original_tags) + + # Build frontmatter + content = get_frontmatter_template( + original_content["licence"], + model_source.source_model, + all_tags, + ) + + # Add header + content += get_header_template( + model_source.original_author, + model_source.model_name, + model_source.source_model, + ) + + # Add quantisation table + table_formatter = TableFormatter() + for quant_type in table_formatter.get_ordered_quantisation_types(): + result = results.get(quant_type) + content += table_formatter.format_quantisation_row( + quant_type, result, model_source, output_repo + ) + + # Add F16 row if applicable + if not model_source.is_gguf_repo and output_repo: + content += self._format_f16_row(model_source, results, output_repo, models_dir) + + # Add quantisation information + content += get_quantisation_info() + + # Add original model section if available + if original_content.get("readme"): + content += get_original_model_section(original_content["readme"]) + + return content + + def _format_f16_row( + self, + model_source: ModelSource, + results: dict[QuantisationType, QuantisationResult], + output_repo: str, + models_dir: Path | None = None, + ) -> str: + """Format F16 GGUF row for the table. + + Creates a properly formatted F16 reference row for the quantisation + table using source model information, results data, and repository + details with optional models directory for file size calculation. + + Returns: + Formatted F16 table row. + """ + # Get F16 result from results dict + f16_result = results.get(QuantisationType.F16) + + # Get file size + f16_size = "-" + if f16_result and hasattr(f16_result, "file_size"): + f16_size = f16_result.file_size or "-" + elif models_dir: + # Try to get from actual file + f16_filename = f"{model_source.original_author}-{model_source.model_name}-f16.gguf" + f16_path = models_dir / model_source.model_name / f16_filename + if f16_path.exists(): + f16_size = FileSizeFormatter.get_file_size(f16_path) + + # Get status + status = "planned" + if f16_result and hasattr(f16_result, "status"): + status = f16_result.status + + return get_f16_row_template( + model_source.original_author, + model_source.model_name, + output_repo, + f16_size, + status, + ) diff --git a/helpers/readme/templates.py b/helpers/readme/templates.py new file mode 100644 index 0000000..2c1dfdc --- /dev/null +++ b/helpers/readme/templates.py @@ -0,0 +1,228 @@ +"""README templates for quantised models. + +Provides template strings and builders for generating README documentation. +""" + +from __future__ import annotations + + +def get_frontmatter_template( + licence: str, + base_model: str, + tags: list[str], +) -> str: + """Generate YAML frontmatter for README. + + Creates the YAML metadata header for HuggingFace model cards including + licence information, library specification, base model reference, and + tag listings formatted according to HuggingFace conventions. + + Returns: + Formatted YAML frontmatter string. + """ + frontmatter = f"""--- +license: {licence} +library_name: gguf +base_model: {base_model} +tags: +""" + for tag in tags: + if tag.strip(): + frontmatter += f"- {tag.strip()}\n" + + frontmatter += "---\n\n" + return frontmatter + + +def get_header_template( + original_author: str, + model_name: str, + source_model: str, +) -> str: + """Generate README header section. 
+ + Creates the main header section with model title, description of the + quantisation process, and initial table structure for displaying + quantisation variants and their status information. + + Returns: + Formatted header markdown. + """ + hf_url = f"https://huggingface.co/{source_model}" + return f"""# {original_author}-{model_name}-GGUF + +GGUF quantisations of [{source_model}]({hf_url}) using +[Bartowski](https://huggingface.co/bartowski)'s method. Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools) +which replicates Bartowski's quantisation profiles. + +| Variant | Configuration | Status | +|---|---|---| +""" + + +def get_downloads_section(download_instruction: str | None = None) -> str: + """Generate downloads and usage section. + + Creates comprehensive usage documentation including download instructions, + quick start examples for various runtimes (llama.cpp, Ollama, LM Studio), + and integration guidance with optional custom instructions. + + Returns: + Formatted downloads section markdown. + """ + base_section = """ +## šŸ“„ Download Links + +Direct download links are available for each quantisation in the table above. Click the āœ… status to +go to the file page. + +## šŸš€ Quick Start + +### Using llama.cpp + +```bash +# Download the model (replace Q4_K_M with your chosen quantisation) +wget https://huggingface.co/YOUR_REPO/resolve/main/model-Q4_K_M.gguf + +# Run with llama.cpp +./llama-cli -m model-Q4_K_M.gguf -p "Your prompt here" +``` + +### Using Ollama + +```bash +# Create Modelfile +echo "FROM ./model-Q4_K_M.gguf" > Modelfile + +# Create and run the model +ollama create mymodel -f Modelfile +ollama run mymodel +``` + +### Using LM Studio + +1. Open LM Studio +2. Click "Download Model" +3. Paste the HuggingFace repository URL +4. Select your preferred quantisation +5. Click Download + +""" + if download_instruction: + base_section = f"{download_instruction}\n\n{base_section}" + + return base_section + + +def get_quantisation_info() -> str: + """Get information about quantisation types. + + Returns: + Formatted quantisation information markdown. + """ + return """ +## šŸ“Š Quantisation Information + +### Bartowski Naming Convention + +- **L variants** (Q3_K_L, Q4_K_L, Q5_K_L): Uses Q8_0 for embeddings/output weights +- **M variants** (Q3_K_M, Q4_K_M, Q5_K_M): Standard K-quant configuration +- **XL variant** (Q3_K_XL): Q8_0 embeddings + Q6_K output weights +- **_L suffix** (Q6_K_L): Q8_0 for output.weight tensor + +### Recommended Quantisations + +- **Q4_K_M**: Best balance of quality and size (4.58GB for 7B model) +- **Q5_K_M**: Higher quality, larger size (5.33GB for 7B model) +- **Q3_K_L**: Smallest with good quality (3.35GB for 7B model) +- **Q6_K_L**: Near original quality (5.65GB for 7B model) +- **Q8_0**: Highest quality quantisation (7.17GB for 7B model) + +### Basic vs K-quants + +- **Basic types** (Q4_0, Q5_0, Q6_0, Q8_0): Simple quantisation, universally compatible +- **K-quants** (Q#_K_*): Advanced quantisation with better quality/size ratios + +Choose K-quants when available for better performance. Basic types are fallbacks for unsupported +architectures. +""" + + +def get_original_model_section( + original_readme: str, + separator: str = "---", +) -> str: + """Format original model documentation section. + + Formats the original model's documentation for inclusion in the + quantised model's README, preserving important context whilst + clearly separating it from the quantisation-specific information. 
+ + Returns: + Formatted original model section. + """ + if not original_readme: + return "" + + return f""" +{separator} + +## Original Model Information + +{original_readme} +""" + + +def get_f16_row_template( + original_author: str, + model_name: str, + output_repo: str, + file_size: str = "-", + status: str = "completed", +) -> str: + """Generate F16 GGUF row for the table. + + Creates a formatted table row for the F16 reference model with + appropriate status indicators, download links, and file size + information based on upload status and availability. + + Returns: + Formatted table row for F16. + """ + filename = f"{original_author}-{model_name}-f16.gguf" + url = f"https://huggingface.co/{output_repo}/blob/main/{filename}" + + if status == "uploading": + status_text = f"ā¬†ļø Uploading... ({file_size})" + elif status == "completed": + status_text = f"[āœ… {file_size}]({url})" + else: + status_text = "ā³ Queued" + + return f"| **F16** | Full precision reference | {status_text} |\n" + + +def get_troubleshooting_section() -> str: + """Get troubleshooting section for README. + + Returns: + Formatted troubleshooting markdown. + """ + return """ +## šŸ”§ Troubleshooting + +### File Not Found +- Ensure you're using the correct repository URL +- Check that the quantisation has completed (āœ… status) +- Try refreshing the page if recently uploaded + +### Performance Issues +- Use smaller quantisations for limited RAM/VRAM +- Q4_K_M offers the best balance for most users +- Enable GPU acceleration if available + +### Compatibility +- K-quants require llama.cpp or compatible runtime +- Basic types (Q4_0, Q5_0, etc.) work with all runtimes +- Check your runtime's documentation for supported types +""" diff --git a/helpers/services/__init__.py b/helpers/services/__init__.py deleted file mode 100644 index 5b59db9..0000000 --- a/helpers/services/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Service layer for llm-gguf-tools. - -Provides high-level service interfaces for interacting with external systems -including HuggingFace, llama.cpp, and filesystem operations. Uses UK English -spelling conventions throughout. -""" diff --git a/helpers/services/gguf.py b/helpers/services/gguf.py deleted file mode 100644 index c9ccf80..0000000 --- a/helpers/services/gguf.py +++ /dev/null @@ -1,478 +0,0 @@ -"""GGUF file operations service. - -Provides unified interface for creating, writing, and manipulating GGUF files. -Consolidates GGUF-specific operations from conversion and quantisation workflows. -Uses UK English spelling conventions throughout. -""" - -from __future__ import annotations - -import gc -import json -import traceback -from pathlib import Path -from typing import TYPE_CHECKING, Any, Protocol - -import gguf -import torch -from safetensors import safe_open - -from helpers.logger import logger -from helpers.services.filesystem import FilesystemService -from helpers.utils.config_parser import ConfigParser - - -class VisionConfig(Protocol): - """Protocol for vision model configuration.""" - - hidden_size: int - num_hidden_layers: int - num_attention_heads: int - intermediate_size: int - patch_size: int - spatial_merge_size: int - - -class TensorMapper(Protocol): - """Protocol for tensor name mapping.""" - - def map_tensor_name(self, name: str) -> str | None: - """Map a tensor name to its GGUF equivalent.""" - - -if TYPE_CHECKING: - import numpy as np - - from helpers.models.conversion import ModelConfig - - -class GGUFWriter: - """Manages GGUF file creation and metadata writing. 
- - Provides high-level interface for GGUF file operations including metadata - configuration, tensor addition, and tokeniser integration. Encapsulates - low-level GGUF library interactions for consistent error handling. - """ - - def __init__(self, output_path: Path, architecture: str) -> None: - """Initialise GGUF writer with output path and architecture. - - Creates the underlying GGUF writer instance and prepares for metadata - and tensor addition. Sets up the file structure for the specified - model architecture. - """ - self.output_path = output_path - self.architecture = architecture - self.writer = gguf.GGUFWriter(str(output_path), architecture) - logger.info(f"Created GGUF writer for {architecture} architecture") - - def add_metadata(self, model_config: ModelConfig, model_name: str) -> None: - """Add comprehensive metadata from model configuration. - - Writes general model information, architectural parameters, and - quantisation settings to the GGUF file header. Handles both standard - and vision model configurations with appropriate parameter mapping. - """ - # General metadata - self.writer.add_name(model_name) - self.writer.add_description(f"Converted from {model_config.architectures[0]}") - self.writer.add_file_type(gguf.LlamaFileType.ALL_F32) - - # Log architecture being used - logger.info(f"Setting GGUF architecture: {self.architecture}") - if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}: - logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp") - - # Model parameters from config - params = model_config.to_gguf_params() - self.writer.add_context_length(params.context_length) - self.writer.add_embedding_length(params.embedding_length) - self.writer.add_block_count(params.block_count) - self.writer.add_feed_forward_length(params.feed_forward_length) - self.writer.add_head_count(params.attention_head_count) - self.writer.add_head_count_kv(params.attention_head_count_kv) - self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon) - self.writer.add_rope_freq_base(params.rope_freq_base) - self.writer.add_rope_dimension_count(params.rope_dimension_count) - - logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context") - - def add_vision_metadata(self, vision_config: VisionConfig | None) -> None: - """Add vision model parameters to GGUF metadata. - - Configures vision-specific parameters for multimodal models including - embedding dimensions, attention heads, and spatial processing settings. - """ - if not vision_config: - return - - logger.info("Adding vision model parameters...") - self.writer.add_vision_embedding_length(vision_config.hidden_size) - self.writer.add_vision_block_count(vision_config.num_hidden_layers) - self.writer.add_vision_head_count(vision_config.num_attention_heads) - self.writer.add_vision_feed_forward_length(vision_config.intermediate_size) - self.writer.add_vision_patch_size(vision_config.patch_size) - self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size) - - if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps: - self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps) - - def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None: - """Add tokeniser metadata to GGUF file. - - Writes special token IDs and tokeniser model type to enable proper - text processing during inference. Uses sensible defaults for missing - configuration values. 
- """ - self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1)) - self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2)) - self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0)) - self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0)) - - # Add BOS/EOS token addition flags if available - if "add_bos_token" in tokeniser_config: - self.writer.add_add_bos_token(tokeniser_config["add_bos_token"]) - if "add_eos_token" in tokeniser_config: - self.writer.add_add_eos_token(tokeniser_config["add_eos_token"]) - - # Note: tokenizer_model is set by add_tokeniser_vocabulary based on actual tokenizer type - - logger.info("Added tokeniser configuration") - - def add_tokeniser_vocabulary(self, model_path: Path) -> None: - """Add full tokeniser vocabulary to GGUF file. - - Loads and embeds the complete tokeniser vocabulary including tokens, - merges, and scores to enable standalone model usage without external - tokeniser files. Supports BPE, Unigram, and WordPiece tokenizers. - """ - tokenizer_path = model_path / "tokenizer.json" - if not tokenizer_path.exists(): - logger.warning("tokenizer.json not found, skipping vocabulary embedding") - return - - try: - with Path(tokenizer_path).open(encoding="utf-8") as f: - tokenizer_data = json.load(f) - - model_data = tokenizer_data.get("model", {}) - model_type = model_data.get("type", "") - - # Get pre-tokenizer information - pre_tokenizer = tokenizer_data.get("pre_tokenizer", {}) - pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer) - - # Get added tokens - added_tokens = tokenizer_data.get("added_tokens", []) - - if model_type == "BPE": - self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type) - elif model_type == "Unigram": - self._add_unigram_tokenizer(model_data, added_tokens) - elif model_type == "WordPiece": - self._add_wordpiece_tokenizer(model_data, added_tokens) - else: - logger.warning(f"Unsupported tokenizer type: {model_type}") - # Try to add as generic tokenizer - self._add_generic_tokenizer(model_data, tokenizer_data) - - except Exception as e: - logger.error(f"Failed to load tokeniser vocabulary: {e}") - logger.error(traceback.format_exc()) - - def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str: - """Determine pre-tokenizer type from configuration. - - Returns: - Pre-tokenizer type. 
- """ - if not pre_tokenizer: - return "default" - - # Check for various pre-tokenizer types - pre_type = pre_tokenizer.get("type", "") - if "ByteLevel" in str(pre_type): - return "llama3" - if "Metaspace" in str(pre_type): - return "default" - - return "default" - - def _add_bpe_tokenizer( - self, model_data: dict[str, Any], added_tokens: list[dict[str, Any]], pre_type: str - ) -> None: - """Add BPE tokenizer vocabulary to GGUF.""" - vocab = model_data.get("vocab", {}) - merges = model_data.get("merges", []) - - if not vocab: - logger.warning("No vocabulary found in BPE tokenizer") - return - - # Create token list sorted by index - max_idx = max(vocab.values()) if vocab else 0 - tokens = [""] * (max_idx + 1) - - for token, idx in vocab.items(): - if 0 <= idx < len(tokens): - tokens[idx] = token - - # Handle added tokens - for added_token in added_tokens: - token_id = added_token.get("id") - content = added_token.get("content") - if token_id is not None and content is not None: - if token_id >= len(tokens): - tokens.extend([""] * (token_id - len(tokens) + 1)) - tokens[token_id] = content - - # Prepare token types - token_types = [] - for i, _token in enumerate(tokens): - # Check if it's a special/control token - is_special = any( - added_token.get("id") == i and added_token.get("special", False) - for added_token in added_tokens - ) - if is_special: - token_types.append(gguf.TokenType.CONTROL) - else: - token_types.append(gguf.TokenType.NORMAL) - - # Add to GGUF - self.writer.add_tokenizer_model("gpt2") - self.writer.add_tokenizer_pre(pre_type) - self.writer.add_token_list(tokens) - self.writer.add_token_scores([0.0] * len(tokens)) - self.writer.add_token_types(token_types) - - if merges: - self.writer.add_token_merges(merges) - logger.info(f"Added {len(merges)} BPE merges") - - logger.info(f"Successfully embedded BPE tokeniser ({len(tokens)} tokens)") - - def _add_unigram_tokenizer( - self, - model_data: dict[str, Any], - added_tokens: list[dict[str, Any]], # noqa: ARG002 - ) -> None: - """Add Unigram/SentencePiece tokenizer to GGUF.""" - vocab = model_data.get("vocab", []) - if not vocab: - logger.warning("No vocabulary found in Unigram tokenizer") - return - - tokens = [] - scores = [] - token_types = [] - - # Process regular vocabulary - for item in vocab: - if isinstance(item, list) and len(item) >= 2: - token = item[0] - score = float(item[1]) if len(item) > 1 else 0.0 - tokens.append(token) - scores.append(score) - - # Determine token type - if token.startswith("<") and token.endswith(">"): - token_types.append(gguf.TokenType.CONTROL) - elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"): - token_types.append(gguf.TokenType.BYTE) - else: - token_types.append(gguf.TokenType.NORMAL) - - # Add to GGUF - self.writer.add_tokenizer_model("llama") - self.writer.add_tokenizer_pre("default") - self.writer.add_token_list(tokens) - self.writer.add_token_scores(scores) - self.writer.add_token_types(token_types) - - logger.info(f"Successfully embedded Unigram tokeniser ({len(tokens)} tokens)") - - def _add_wordpiece_tokenizer( - self, - model_data: dict[str, Any], - added_tokens: list[dict[str, Any]], # noqa: ARG002 - ) -> None: - """Add WordPiece tokenizer to GGUF.""" - vocab = model_data.get("vocab", {}) - if not vocab: - logger.warning("No vocabulary found in WordPiece tokenizer") - return - - # Create token list sorted by index - max_idx = max(vocab.values()) if vocab else 0 - tokens = [""] * (max_idx + 1) - - for token, idx in vocab.items(): - if 0 <= idx < len(tokens): 
- tokens[idx] = token - - # Token types (all normal for WordPiece) - token_types = [gguf.TokenType.NORMAL] * len(tokens) - - # Add to GGUF - self.writer.add_tokenizer_model("bert") - self.writer.add_tokenizer_pre("default") - self.writer.add_token_list(tokens) - self.writer.add_token_scores([0.0] * len(tokens)) - self.writer.add_token_types(token_types) - - logger.info(f"Successfully embedded WordPiece tokeniser ({len(tokens)} tokens)") - - def _add_generic_tokenizer( - self, - model_data: dict[str, Any], - tokenizer_data: dict[str, Any], # noqa: ARG002 - ) -> None: - """Try to add a generic tokenizer based on available data.""" - vocab = model_data.get("vocab") - if not vocab: - logger.warning("Cannot extract vocabulary from unknown tokenizer type") - return - - # Try to extract tokens in a generic way - tokens = [] - if isinstance(vocab, dict): - # Dictionary-style vocab - max_idx = max(vocab.values()) if vocab else 0 - tokens = [""] * (max_idx + 1) - for token, idx in vocab.items(): - if 0 <= idx < len(tokens): - tokens[idx] = token - elif isinstance(vocab, list): - # List-style vocab - for item in vocab: - if isinstance(item, str): - tokens.append(item) - elif isinstance(item, list) and len(item) > 0: - tokens.append(item[0]) - - if tokens: - self.writer.add_tokenizer_model("llama") # Default to llama - self.writer.add_tokenizer_pre("default") - self.writer.add_token_list(tokens) - self.writer.add_token_scores([0.0] * len(tokens)) - self.writer.add_token_types([gguf.TokenType.NORMAL] * len(tokens)) - logger.info(f"Added generic tokeniser ({len(tokens)} tokens)") - else: - logger.warning("Could not extract tokens from unknown tokenizer format") - - def add_tensor(self, name: str, data: np.ndarray) -> None: - """Add a tensor to the GGUF file. - - Writes tensor data with the specified name to the file. Handles - data type conversions and validates tensor shapes. - """ - self.writer.add_tensor(name, data) - - def finalise(self) -> None: - """Write all data to file and close writer. - - Completes the GGUF file creation by writing headers, key-value data, - and tensor data in the correct order. Ensures proper file closure. - """ - logger.info(f"Writing GGUF file to {self.output_path}") - self.writer.write_header_to_file() - self.writer.write_kv_data_to_file() - self.writer.write_tensors_to_file() - self.writer.close() - logger.info("GGUF file written successfully") - - -class GGUFConverter: - """High-level GGUF conversion orchestrator. - - Coordinates the complete conversion workflow from source models to GGUF - format, managing metadata extraction, tensor mapping, and file writing. - """ - - @staticmethod - def convert_safetensors( - model_path: Path, - output_path: Path, - model_config: ModelConfig, - architecture: str, - tensor_mapper: TensorMapper, - ) -> bool: - """Convert SafeTensors model to GGUF format. - - Orchestrates the conversion process including metadata setup, tensor - loading with BFloat16 support, name mapping, and tokeniser integration. - - Returns: - True if conversion successful, False otherwise. 
- """ - logger.info(f"Converting {model_path.name} to GGUF...") - - # Create writer - writer_wrapper = GGUFWriter(output_path, architecture) - - # Add metadata - writer_wrapper.add_metadata(model_config, model_path.name) - - # Add vision metadata if present - if model_config.vision_config: - writer_wrapper.add_vision_metadata(model_config.vision_config) - - # Load and add tensors - fs = FilesystemService() - tensor_files = fs.find_safetensor_files(model_path) - logger.info(f"Found {len(tensor_files)} tensor file(s)") - - tensor_count = 0 - for tensor_file in tensor_files: - logger.info(f"Loading {tensor_file.name}...") - with safe_open(tensor_file, framework="pt") as f: - for tensor_name in f.keys(): # noqa: SIM118 - tensor_data = f.get_tensor(tensor_name) - - # Convert BFloat16 to Float32 - if hasattr(tensor_data, "numpy"): - if torch and tensor_data.dtype == torch.bfloat16: - tensor_data = tensor_data.float() - tensor_data = tensor_data.numpy() - - # Map tensor name - gguf_name = tensor_mapper.map_tensor_name(tensor_name) - - if gguf_name: - writer_wrapper.add_tensor(gguf_name, tensor_data) - tensor_count += 1 - - if tensor_count % 100 == 0: - logger.info(f" Processed {tensor_count} tensors...") - - # Free memory after processing each tensor - del tensor_data - - # Force garbage collection after processing each file - gc.collect() - - logger.info(f"Total tensors processed: {tensor_count}") - - # Add tokeniser configuration - try: - tok_config = ConfigParser.load_tokeniser_config(model_path) - writer_wrapper.add_tokeniser(tok_config) - logger.info("Tokeniser configuration added") - except Exception as e: - logger.warning(f"Could not add tokeniser configuration: {e}") - - # Add tokeniser vocabulary (critical for standalone usage) - try: - writer_wrapper.add_tokeniser_vocabulary(model_path) - except Exception as e: - logger.error(f"Failed to embed tokeniser vocabulary: {e}") - logger.error("Model will not work without external tokeniser files!") - - # Finalise file - writer_wrapper.finalise() - - file_size = fs.get_file_size(output_path) - logger.info(f"Conversion complete! Output: {output_path} ({file_size})") - - return True diff --git a/helpers/services/huggingface.py b/helpers/services/huggingface.py deleted file mode 100644 index 9793caa..0000000 --- a/helpers/services/huggingface.py +++ /dev/null @@ -1,744 +0,0 @@ -"""HuggingFace operations service. - -Handles all interactions with HuggingFace including model downloads, -uploads, README generation, and repository management. Uses UK English -spelling conventions throughout. -""" - -from __future__ import annotations - -import json -import re -import shutil -import subprocess -import tempfile -from pathlib import Path -from types import SimpleNamespace -from typing import TYPE_CHECKING - -from helpers.config.quantisation_configs import QUANTISATION_CONFIGS -from helpers.logger import logger -from helpers.models.quantisation import QuantisationType -from helpers.utils.config_parser import ConfigParser - -if TYPE_CHECKING: - from helpers.models.quantisation import ModelSource, QuantisationResult - -# Constants for file size formatting -GIBIBYTE = 1024**3 - - -class HuggingFaceService: - """Manages HuggingFace repository operations. - - Provides methods for downloading models, uploading files, and managing - repositories. Handles authentication, error recovery, and progress tracking - for robust interaction with HuggingFace services. - """ - - @staticmethod - def get_username() -> str: - """Get authenticated HuggingFace username. 
- - Retrieves the current user's HuggingFace username using the CLI. - Requires prior authentication via `huggingface-cli login`. - - Returns: - HuggingFace username. - - Raises: - RuntimeError: If not authenticated or CLI not available. - """ - try: - result = subprocess.run( - ["huggingface-cli", "whoami"], - capture_output=True, - text=True, - check=True, - ) - return result.stdout.strip() - except (subprocess.CalledProcessError, FileNotFoundError) as err: - msg = "Please log in to HuggingFace first: huggingface-cli login" - raise RuntimeError(msg) from err - - @staticmethod - def download_model( - model_name: str, output_dir: Path, include_pattern: str | None = None - ) -> None: - """Download model from HuggingFace. - - Downloads a complete model or specific files matching a pattern. - Creates the output directory if it doesn't exist. Supports filtered - downloads for efficient bandwidth usage when only certain files are needed. - """ - logger.info(f"Downloading {model_name} to {output_dir}") - - cmd = [ - "huggingface-cli", - "download", - model_name, - "--local-dir", - str(output_dir), - ] - - if include_pattern: - cmd.extend(["--include", include_pattern]) - - subprocess.run(cmd, check=True, capture_output=True, text=True) - logger.info("Download complete") - - @staticmethod - def upload_file( - repo_id: str, - local_path: Path, - repo_path: str | None = None, - create_repo: bool = False, - ) -> None: - """Upload a file to HuggingFace repository. - - Uploads a single file to the specified repository path. Can create - the repository if it doesn't exist. Uses git directly when possible - to avoid automatic PR creation. - - Raises: - CalledProcessError: If upload fails. - """ - repo_path = repo_path or local_path.name - logger.info(f"Uploading {local_path.name} to {repo_id}/{repo_path}") - - # Try git-based upload first to avoid PR creation - if HuggingFaceService._try_git_upload( - repo_id, local_path, repo_path, create_repo=create_repo - ): - logger.info(f"Uploaded {repo_path} via git") - return - - # Fallback to huggingface-cli - logger.info("Git upload failed, trying huggingface-cli...") - cmd = [ - "huggingface-cli", - "upload", - repo_id, - str(local_path), - repo_path, - "--revision", - "main", # Explicitly push to main branch - "--commit-message", - f"Add {repo_path}", - ] - - if create_repo: - cmd.append("--create") - - try: - subprocess.run(cmd, check=True, capture_output=True) - logger.info(f"Uploaded {repo_path}") - except subprocess.CalledProcessError: - if create_repo: - # Repository might already exist, retry without --create - cmd = cmd[:-1] # Remove --create flag - subprocess.run(cmd, check=True, capture_output=True, text=True) - logger.info(f"Updated {repo_path}") - else: - raise - - @staticmethod - def _try_git_upload( - repo_id: str, - local_path: Path, - repo_path: str, - *, - create_repo: bool = False, - ) -> bool: - """Try to upload file using git directly to avoid PR creation. - - Returns: - bool: True if upload successful, False if should fallback to CLI. 
- """ - try: - with tempfile.TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) - repo_url = f"https://huggingface.co/{repo_id}" - - # Clone repository - logger.info(f"Cloning {repo_url}...") - result = subprocess.run( - ["git", "clone", repo_url, str(temp_path / "repo")], - check=False, - capture_output=True, - text=True, - ) - - if result.returncode != 0: - if create_repo: - # Repository doesn't exist, let huggingface-cli handle creation - return False - logger.warning(f"Clone failed: {result.stderr}") - return False - - repo_dir = temp_path / "repo" - target_file = repo_dir / repo_path - - # Ensure target directory exists - target_file.parent.mkdir(parents=True, exist_ok=True) - - # Copy file - shutil.copy2(local_path, target_file) - - # Check if there are any changes - status_result = subprocess.run( - ["git", "status", "--porcelain"], - cwd=repo_dir, - capture_output=True, - text=True, - check=True, - ) - - if not status_result.stdout.strip(): - logger.info(f"No changes detected for {repo_path}, file already up-to-date") - return True # File is already up-to-date, no need to push - - # Git add, commit, push - subprocess.run( - ["git", "add", repo_path], - cwd=repo_dir, - check=True, - capture_output=True, - text=True, - ) - subprocess.run( - ["git", "commit", "-m", f"Update {repo_path}"], - cwd=repo_dir, - check=True, - capture_output=True, - text=True, - ) - subprocess.run( - ["git", "push"], - cwd=repo_dir, - check=True, - capture_output=True, - text=True, - ) - - return True - - except subprocess.CalledProcessError as e: - logger.warning(f"Git upload failed: {e}") - return False - except Exception as e: - logger.warning(f"Git upload error: {e}") - return False - - -class ReadmeGenerator: - """Generates README files for quantised models. - - Creates comprehensive README documentation including model cards, - quantisation details, and status tracking. Supports both initial - planning documentation and final result summaries. - """ - - def generate( - self, - model_source: ModelSource, - results: dict[QuantisationType, QuantisationResult], - models_dir: Path, - output_repo: str | None = None, - ) -> Path: - """Generate README file for quantised model repository. - - Creates a comprehensive README with frontmatter, quantisation table, - and original model information. Handles status tracking for planned, - processing, and completed quantisations. - - Returns: - Path to generated README file. - """ - logger.info("Creating model card...") - - model_dir = models_dir / model_source.model_name - readme_path = model_dir / "README.md" - - # Get original README content - original_content = self._get_original_readme(model_source, model_dir) - - # Get architecture from config.json - architecture = self._get_architecture(model_dir) - - # Generate new README - readme_content = self._generate_readme_content( - model_source, results, original_content, output_repo, architecture, models_dir - ) - - readme_path.write_text(readme_content) - return readme_path - - def _get_architecture(self, model_dir: Path) -> str | None: - """Get the architecture from the model's config.json. - - Returns: - Architecture name or None if not found. 
- """ - config_path = model_dir / "config.json" - if not config_path.exists(): - return None - - try: - with config_path.open(encoding="utf-8") as f: - config = json.load(f) - - # Get the architectures field - it's a list - architectures = config.get("architectures", []) - if architectures: - arch_name = architectures[0] - - # Get the mapped architecture (what it will be converted to) - parser = ConfigParser() - mapped_arch = parser.get_architecture_mapping(arch_name) - - logger.info(f"Architecture: {arch_name} -> {mapped_arch}") - return mapped_arch - except Exception as e: - logger.warning(f"Could not determine architecture: {e}") - - return None - - def _get_original_readme(self, model_source: ModelSource, model_dir: Path) -> dict[str, str]: - """Extract original README and metadata. - - Downloads or reads the original model's README for inclusion in the - quantised model documentation. Parses YAML frontmatter if present. - - Returns: - Dictionary with readme content, licence, tags, and frontmatter. - """ - content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""} - - # Check for preserved original README first - original_readme_path = model_dir / "README.original.md" - readme_path = model_dir / "README.md" - - if original_readme_path.exists(): - # Use the preserved original - content["readme"] = original_readme_path.read_text(encoding="utf-8") - logger.info(f"Found preserved original README ({len(content['readme'])} characters)") - elif readme_path.exists(): - # First time - preserve the original and use it - readme_content = readme_path.read_text(encoding="utf-8") - - # Check if this is already our generated README - if ( - f"{model_source.original_author}-{model_source.model_name}-GGUF" - not in readme_content - ): - # This is the original - preserve it - original_readme_path.write_text(readme_content, encoding="utf-8") - content["readme"] = readme_content - readme_len = len(content["readme"]) - logger.info( - f"Preserved original README as README.original.md ({readme_len} characters)" - ) - else: - # This is our generated README, need to download the original - logger.info("Found generated README, downloading original from source") - content = self._download_readme(model_source) - # Save the downloaded original for future use - if content["readme"]: - original_readme_path.write_text(content["readme"], encoding="utf-8") - logger.info("Preserved downloaded original README as README.original.md") - else: - # No local README - download from source - content = self._download_readme(model_source) - # Save the downloaded original for future use - if content["readme"]: - original_readme_path.write_text(content["readme"], encoding="utf-8") - logger.info("Preserved downloaded original README as README.original.md") - - # Parse frontmatter if present - if content["readme"].startswith("---\n"): - content = self._parse_frontmatter(content["readme"]) - - return content - - def _download_readme(self, model_source: ModelSource) -> dict[str, str]: - """Download README from HuggingFace repository. - - Attempts to download just the README.md file from the source repository - for efficient documentation extraction. - - Returns: - Dictionary with readme content and default metadata. 
- """ - content = {"readme": "", "licence": "apache-2.0", "tags": "", "frontmatter": ""} - - with tempfile.TemporaryDirectory() as temp_dir: - try: - logger.info(f"Downloading README from {model_source.source_model}...") - subprocess.run( - [ - "huggingface-cli", - "download", - model_source.source_model, - "--include", - "README.md", - "--local-dir", - temp_dir, - ], - check=True, - capture_output=True, - ) - - readme_path = Path(temp_dir) / "README.md" - if readme_path.exists(): - content["readme"] = readme_path.read_text(encoding="utf-8") - logger.info(f"Downloaded README ({len(content['readme'])} characters)") - except subprocess.CalledProcessError as e: - logger.warning(f"Failed to download README: {e}") - - return content - - def _parse_frontmatter(self, readme_text: str) -> dict[str, str]: - """Parse YAML frontmatter from README. - - Extracts metadata from YAML frontmatter including licence, tags, - and other model card fields. - - Returns: - Dictionary with separated content and metadata. - """ - lines = readme_text.split("\n") - if lines[0] != "---": - return { - "readme": readme_text, - "licence": "apache-2.0", - "tags": "", - "frontmatter": "", - } - - frontmatter_end = -1 - for i, line in enumerate(lines[1:], 1): - if line == "---": - frontmatter_end = i - break - - if frontmatter_end == -1: - return { - "readme": readme_text, - "licence": "apache-2.0", - "tags": "", - "frontmatter": "", - } - - frontmatter = "\n".join(lines[1:frontmatter_end]) - content = "\n".join(lines[frontmatter_end + 1 :]) - - # Extract licence - licence_match = re.search(r"^license:\s*(.+)$", frontmatter, re.MULTILINE) - licence_val = licence_match.group(1).strip().strip('"') if licence_match else "apache-2.0" - - # Extract tags - tags = [] - in_tags = False - for line in frontmatter.split("\n"): - if line.startswith("tags:"): - in_tags = True - continue - if in_tags: - if line.startswith("- "): - tags.append(line[2:].strip()) - elif line and not line.startswith(" "): - break - - return { - "readme": content, - "licence": licence_val, - "tags": ",".join(tags), - "frontmatter": frontmatter, - } - - def _generate_readme_content( - self, - model_source: ModelSource, - results: dict[QuantisationType, QuantisationResult], - original_content: dict[str, str], - output_repo: str | None = None, - architecture: str | None = None, - models_dir: Path | None = None, - ) -> str: - """Generate complete README content with quantisation details. - - Creates the full README including YAML frontmatter, quantisation status - table, and original model information. - - Returns: - Complete README markdown content. 
- """ - # Build tags based on actual successful quantisations - our_tags = ["gguf"] - - # Add tags for successful quantisations only - for quant_type, result in results.items(): - if hasattr(result, "status") and result.status == "completed": - if quant_type == "F16": - our_tags.append("f16") - elif hasattr(result, "quantisation_type"): - # Convert to lowercase tag format (e.g., Q3_K_M -> q3_k_m) - our_tags.append(result.quantisation_type.value.lower()) - - # If no quantisations succeeded but F16 is available, still add basic tags - if ( - len(our_tags) == 1 - and QuantisationType.F16 in results - and hasattr(results[QuantisationType.F16], "status") - and results[QuantisationType.F16].status in {"completed", "uploading"} - ): - our_tags.append("f16") - - original_tags = original_content["tags"].split(",") if original_content["tags"] else [] - all_tags = sorted(set(our_tags + original_tags)) - - # Build frontmatter - frontmatter = f"""--- -license: {original_content["licence"]} -library_name: gguf -base_model: {model_source.source_model} -tags: -""" - for tag in all_tags: - if tag.strip(): - frontmatter += f"- {tag.strip()}\n" - - frontmatter += "---\n\n" - - # Build main content - hf_url = f"https://huggingface.co/{model_source.source_model}" - content = f"""# {model_source.original_author}-{model_source.model_name}-GGUF - -GGUF quantisations of [{model_source.source_model}]({hf_url}) using -[Bartowski](https://huggingface.co/bartowski)'s method. Created with [llm-gguf-tools](https://git.tomfos.tr/tom/llm-gguf-tools) -which replicates Bartowski's quantisation profiles. - -| Variant | Configuration | Status | -|---|---|---| -""" - - # Add results table - properly sorted by precision and type - # Order: Q3 K-quants, Q4 basic, Q4 K-quants, Q5 basic, Q5 K-quants, etc. 
- ordered_types = [ - # Q3 K-quants - QuantisationType.Q3_K_M, - QuantisationType.Q3_K_L, - QuantisationType.Q3_K_XL, - # Q4 types - QuantisationType.Q4_0, # Basic - QuantisationType.Q4_K_M, - QuantisationType.Q4_K_L, - # Q5 types - QuantisationType.Q5_0, # Basic - QuantisationType.Q5_K_M, - QuantisationType.Q5_K_L, - # Q6 types - QuantisationType.Q6_0, # Basic - QuantisationType.Q6_K, - QuantisationType.Q6_K_L, - # Q8 types - QuantisationType.Q8_0, # Basic - QuantisationType.Q8_K, - ] - - for quant_type in ordered_types: - result_temp = results.get(quant_type) - if result_temp is None: - result = SimpleNamespace(status="planned", success=False) # type: ignore[assignment] - else: - result = result_temp - - config = QUANTISATION_CONFIGS.get(quant_type) - status = self._format_status(result, model_source, quant_type, output_repo) - - # Get configuration description from the config itself - config_desc = ( - config.get_compact_config(QUANTISATION_CONFIGS) - if config - else f"{quant_type} all layers" - ) - - content += f"| **{quant_type.value}** | {config_desc} | {status} |\n" - - # Add F16 row at the bottom if we converted from SafeTensors - # Note: Named "f16" for compatibility, but contains mixed F16/F32 tensors - # (BF16 source tensors are converted to F32 to preserve precision) - if not model_source.is_gguf_repo and output_repo: - f16_filename = f"{model_source.original_author}-{model_source.model_name}-f16.gguf" - f16_url = f"https://huggingface.co/{output_repo}/blob/main/{f16_filename}" - - # Get F16 result from results dict (if tracking it) - f16_result = results.get(QuantisationType.F16) - - # Get file size - f16_size = "-" - if f16_result and hasattr(f16_result, "file_size"): - f16_size = f16_result.file_size or "-" - elif models_dir: - # Try to get from actual file - f16_path = models_dir / model_source.model_name / f16_filename - if f16_path.exists(): - size_bytes = f16_path.stat().st_size - size_gb = size_bytes / GIBIBYTE - f16_size = f"{size_gb:.1f}GB" - - # Format status based on upload state - if f16_result and hasattr(f16_result, "status"): - if f16_result.status == "uploading": - f16_status = f"ā¬†ļø Uploading... ({f16_size})" - elif f16_result.status == "completed": - f16_status = f"[āœ… {f16_size}]({f16_url})" - else: - f16_status = "ā³ Queued" - else: - # Default to available if no status tracking - f16_status = f"[āœ… {f16_size}]({f16_url})" - - content += f"| **F16** | Full precision GGUF (F16/F32 mixed) | {f16_status} |\n" - - content += """ - -**Key:** `E` = Embeddings, `O` = Output, `A` = Attention, `F` = FFN - -""" - - # Add warning for unsupported architectures - if architecture: - supported_archs = { - "llama", - "qwen2", - "gemma", - "phi3", - "falcon", - "gpt2", - "gptj", - "gptneox", - "mpt", - "baichuan", - "stablelm", - } - if architecture not in supported_archs: - content += ( - f"āš ļø **Note:** This model uses the `{architecture}` architecture, which is not " - "yet supported by llama.cpp for quantisation. If quantisations failed, this is " - "why - llama.cpp cannot quantise architectures it doesn't recognise. The F16 " - "GGUF file is provided as a full-precision fallback (requires ~2x model size " - f"in VRAM). 
For `{architecture}` support, check with your inference software " - "or wait for llama.cpp updates.\n\n" - ) - - content += ( - "See [Bartowski Analysis](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/" - "bartowski_analysis.md) for detailed quantisation strategies and " - "[Documentation](https://git.tomfos.tr/tom/llm-gguf-tools/src/branch/main/docs/) " - "for more on the tools and methods I use.\n\n" - ) - - # Add original content - if original_content["readme"]: - content += "## Original Model Card\n\n---\n\n" + original_content["readme"] - else: - content += f"## Original Model\n\nQuantisation of [{model_source.source_model}](https://huggingface.co/{model_source.source_model})." - - return frontmatter + content - - def _format_file_size(self, result: QuantisationResult) -> str: - """Format file size for README table. - - Returns: - Formatted file size string or dash if not available. - """ - if hasattr(result, "file_size") and result.file_size: - return result.file_size - if hasattr(result, "success") and result.success and hasattr(result, "file_path"): - # Try to get file size from path if available - try: - if result.file_path and Path(result.file_path).exists(): - size_bytes = Path(result.file_path).stat().st_size - size_gb = size_bytes / GIBIBYTE - return f"{size_gb:.1f}GB" - except Exception: - pass - return "-" - - def _format_status( - self, - result: QuantisationResult, - model_source: ModelSource, - quant_type: QuantisationType, - output_repo: str | None, - ) -> str: - """Format status indicator for README table. - - Creates appropriate status indicator based on quantisation state - including progress indicators, file sizes, and download links. - - Returns: - Formatted status string for table cell. - """ - status_map = { - "planned": "ā³ Queued", - "processing": "šŸ”„ Processing...", - "uploading": "ā¬†ļø Uploading...", - "failed": "āŒ Failed", - } - - if hasattr(result, "status") and result.status in status_map: - base_status = status_map[result.status] - - # Check for architecture not supported error - if ( - result.status == "failed" - and hasattr(result, "error_message") - and result.error_message - and "architecture not supported" in str(result.error_message).lower() - ): - return "āš ļø Skipped" - - if result.status == "uploading" and hasattr(result, "file_size") and result.file_size: - return f"{base_status} ({result.file_size})" - if result.status == "completed" or (hasattr(result, "success") and result.success): - return self._format_success_status(result, model_source, quant_type, output_repo) - return base_status - - # Legacy support - if hasattr(result, "success") and result.success: - return self._format_success_status(result, model_source, quant_type, output_repo) - return "āŒ Failed" - - def _format_success_status( - self, - result: QuantisationResult, - model_source: ModelSource, - quant_type: QuantisationType, - output_repo: str | None, - ) -> str: - """Format successful quantisation status with download link. - - Creates a download link if repository information is available, - otherwise shows file size. - - Returns: - Formatted success status string. 
- """ - if not output_repo: - return ( - f"āœ… {result.file_size}" - if hasattr(result, "file_size") and result.file_size - else "āœ… Available" - ) - - filename = ( - f"{model_source.original_author}-{model_source.model_name}-{quant_type.value}.gguf" - ) - url = f"https://huggingface.co/{output_repo}?show_file_info={filename}" - - if hasattr(result, "file_size") and result.file_size: - return f"[āœ… {result.file_size}]({url})" - return f"[āœ… Available]({url})" diff --git a/helpers/services/llama_cpp.py b/helpers/services/llama_cpp.py deleted file mode 100644 index 93783b3..0000000 --- a/helpers/services/llama_cpp.py +++ /dev/null @@ -1,295 +0,0 @@ -"""Direct llama.cpp binary execution service. - -Provides direct execution of llama.cpp quantisation binary with proper -tensor-specific override support for L and XL variants. -""" - -from __future__ import annotations - -import os -import platform -import subprocess -from pathlib import Path -from typing import TYPE_CHECKING - -from helpers.logger import logger -from helpers.services.binary_manager import BinaryManager -from helpers.services.filesystem import FilesystemService - -if TYPE_CHECKING: - from helpers.models.quantisation import QuantisationConfig - - -class QuantisationExecutor: - """Executes llama.cpp quantisation with tensor overrides. - - Provides direct binary execution with proper command-line flags for - tensor-specific overrides, supporting Bartowski-style L and XL variants. - """ - - def __init__(self) -> None: - """Initialise quantisation executor.""" - self.fs = FilesystemService() - self.binary_manager = BinaryManager() - self.quantise_binary = self._get_quantise_binary() - self.last_error: str | None = None # Track last error type - - def _get_quantise_binary(self) -> Path | None: - """Get llama-quantize binary, downloading if necessary. - - Returns: - Path to binary if found, None otherwise. - """ - # First check local directory for manual placement - local_binary = Path("./llama-quantize") - if local_binary.exists(): - logger.info(f"Using local llama-quantize binary: {local_binary}") - return local_binary - - # Download from GitHub releases - binary_path = self.binary_manager.get_quantise_binary() - if binary_path and self.binary_manager.check_binary_works(binary_path): - logger.info(f"Using llama-quantize binary: {binary_path}") - return binary_path - - logger.error("Failed to obtain llama-quantize binary") - logger.info( - "You can manually download it from: https://github.com/ggml-org/llama.cpp/releases" - ) - return None - - def execute_quantisation( - self, - input_path: Path, - output_path: Path, - config: QuantisationConfig, - imatrix_path: Path | None = None, - ) -> bool: - """Execute quantisation using llama.cpp binary. - - Builds and executes llama-quantize command with proper tensor override - flags for L and XL variants. - - Returns: - True if quantisation successful, False otherwise. - """ - if not self.quantise_binary: - logger.error("llama-quantize binary not available") - return False - - # Build command - cmd = self._build_quantisation_command(input_path, output_path, config, imatrix_path) - - # Execute with real-time output - return self._execute_command(cmd) - - def _build_quantisation_command( - self, - input_path: Path, - output_path: Path, - config: QuantisationConfig, - imatrix_path: Path | None, - ) -> list[str]: - """Build llama-quantize command with tensor overrides. - - Returns: - Command arguments as list. 
- """ - cmd = [str(self.quantise_binary)] - - # Add imatrix if available - if imatrix_path: - cmd.extend(["--imatrix", str(imatrix_path)]) - if imatrix_path.exists(): - logger.info(f"🧮 Using imatrix: {imatrix_path.name}") - - # Add tensor-specific overrides for L and XL variants - if config.embedding_type: - # Use directly from config - already in correct format - cmd.extend(["--token-embedding-type", config.embedding_type.lower()]) - logger.info(f"āš™ļø Token embedding type: {config.embedding_type}") - - if config.output_type: - # Use directly from config - already in correct format - cmd.extend(["--output-tensor-type", config.output_type.lower()]) - logger.info(f"āš™ļø Output tensor type: {config.output_type}") - - # Note: Per-layer tensor overrides could be added here if needed in future - # For now, embedding and output overrides handle the L/XL variants - - # Get base quantisation type - base_quant = self._get_base_quantisation_type(config.name) - - # Add input, output, and base quantisation type - cmd.extend([str(input_path), str(output_path), base_quant]) - - return cmd - - def _get_base_quantisation_type(self, config_name: str) -> str: - """Get base quantisation type for a config. - - Maps custom variants (Q3_K_L, Q3_K_XL) to their base types (Q3_K_M). - - Returns: - Base quantisation type string. - """ - # Mapping of custom variants to base types - variant_mapping = { - "Q3_K_L": "Q3_K_M", - "Q3_K_XL": "Q3_K_M", - "Q4_K_L": "Q4_K_M", - "Q4_K_XL": "Q4_K_M", - "Q5_K_L": "Q5_K_M", - "Q5_K_XL": "Q5_K_M", - "Q6_K_L": "Q6_K", - "Q6_K_XL": "Q6_K", - } - - return variant_mapping.get(config_name, config_name) - - def _execute_command(self, cmd: list[str]) -> bool: - """Execute command with real-time output streaming. - - Returns: - True if successful, False otherwise. - """ - logger.info(f"šŸ’» Running: {' '.join(cmd)}") - logger.info("ā³ Quantisation in progress... 
(this may take several minutes)") - - # Set LD_LIBRARY_PATH for shared libraries - env = os.environ.copy() - if platform.system() != "Windows": - lib_path = str(self.binary_manager.BINARY_DIR) - if "LD_LIBRARY_PATH" in env: - env["LD_LIBRARY_PATH"] = f"{lib_path}:{env['LD_LIBRARY_PATH']}" - else: - env["LD_LIBRARY_PATH"] = lib_path - - # Track output for architecture detection - output_lines = [] - architecture_error = False - - try: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - bufsize=1, - env=env, - ) - - # Stream output - while True: - if process.stdout is not None: - output = process.stdout.readline() - else: - break - if not output and process.poll() is not None: - break - if output: - output_stripped = output.strip() - logger.info(f"šŸ“Š {output_stripped}") - output_lines.append(output_stripped) - - # Check for architecture-related errors - if any( - phrase in output_stripped.lower() - for phrase in [ - "unsupported architecture", - "unknown architecture", - "architecture not supported", - "model architecture", - "llama_model_load: error loading model", - ] - ): - architecture_error = True - - return_code = process.poll() - if return_code == 0: - logger.info("āœ… Quantisation successful!") - return True - - # Check if this was an architecture error - if architecture_error or return_code == 1: - # Look for architecture info in recent output - for line in output_lines[-10:]: # Check last 10 lines - if "architecture" in line.lower(): - logger.error("āŒ Architecture not supported by llama.cpp") - logger.error(" so cannot be quantised with current llama.cpp but") - logger.error(" F16 GGUF file can be used for inference if supported") - # Store this for the orchestrator to detect - self.last_error = "unsupported_architecture" - return False - - logger.error(f"āŒ Quantisation failed with return code {return_code}") - - except Exception as e: - logger.error(f"āŒ Quantisation failed with exception: {e}") - return False - else: - return False - - -class IMatrixHandler: - """Handles importance matrix file management. - - Manages detection and use of existing importance matrix files for - quantisation guidance. - """ - - def __init__(self) -> None: - """Initialise IMatrixHandler.""" - self.fs = FilesystemService() - - def find_imatrix(self, model_dir: Path) -> Path | None: - """Find existing imatrix file in model directory. - - Returns: - Path to imatrix file if found, None otherwise. - """ - imatrix_path = model_dir / "imatrix.dat" - - if imatrix_path.exists(): - file_size = self.fs.get_file_size(imatrix_path) - logger.info(f"Found existing imatrix: {imatrix_path.name} ({file_size})") - return imatrix_path - - return None - - def prompt_for_user_imatrix(self, model_dir: Path) -> Path | None: - """Prompt user for existing imatrix file. - - Returns: - Path to user-provided imatrix, or None if not available. - """ - imatrix_path = model_dir / "imatrix.dat" - - logger.info(f"Model directory: {model_dir}") - logger.info(f"Looking for imatrix file at: {imatrix_path}") - logger.info( - "Tip: You can download pre-computed imatrix files from Bartowski's repositories!" - ) - logger.info( - " Example: https://huggingface.co/bartowski/MODEL-NAME-GGUF/resolve/main/MODEL-NAME.imatrix" - ) - - response = ( - input("\nā“ Do you have an imatrix file to place in the model directory? 
(y/N): ") - .strip() - .lower() - ) - - if response != "y": - return None - - logger.info(f"Please place your imatrix.dat file in: {model_dir}") - input("ā³ Press Enter when you've placed the imatrix.dat file (or Ctrl+C to cancel)...") - - if imatrix_path.exists(): - file_size = self.fs.get_file_size(imatrix_path) - logger.info(f"Found imatrix file! ({file_size})") - return imatrix_path - - logger.warning("No imatrix.dat file found - continuing without imatrix") - return None diff --git a/helpers/services/llama_python.py b/helpers/services/llama_python.py deleted file mode 100644 index b451af2..0000000 --- a/helpers/services/llama_python.py +++ /dev/null @@ -1,756 +0,0 @@ -"""Python API wrapper for llama-cpp-python quantisation operations. - -Provides high-level Python interfaces for model quantisation using llama-cpp-python -bindings. Implements partial tensor-specific quantisation support through embedding -and output tensor type configuration. -""" - -from __future__ import annotations - -import ctypes -import gc -import logging -import os -import signal -import sys -import traceback -from typing import TYPE_CHECKING, Any, ClassVar, Never - -import psutil - -from helpers.logger import logger -from helpers.services.gguf import GGUFConverter -from helpers.utils.config_parser import ConfigParser -from helpers.utils.tensor_mapping import TensorMapper - -if TYPE_CHECKING: - from pathlib import Path - - from helpers.models.quantisation import QuantisationConfig - -# Import llama_cpp when needed -try: - import llama_cpp - from llama_cpp import llama_model_quantize_params - - LLAMA_CPP_AVAILABLE = True -except ImportError: - LLAMA_CPP_AVAILABLE = False - logger.warning("llama-cpp-python not available - falling back to binary mode") - - -class LlamaCppPythonAPI: - """Python API wrapper for llama.cpp quantisation operations. - - Provides direct Python access to quantisation functionality using llama-cpp-python - bindings. Implements partial tensor-specific quantisation through token embedding - and output tensor type configuration, which provides differentiation between - Q4_K variants even without full per-layer tensor control. - """ - - # Mapping of custom variant prefixes to their base types - VARIANT_BASE_MAPPING: ClassVar[dict[str, str]] = { - "Q3_K_": "Q3_K_M", - "Q4_K_": "Q4_K_M", - "Q5_K_": "Q5_K_M", - "Q6_K_": "Q6_K", - } - - @staticmethod - def is_available() -> bool: - """Check if llama-cpp-python is available for use. - - Returns: - True if llama-cpp-python bindings are installed and functional. - """ - return LLAMA_CPP_AVAILABLE - - @staticmethod - def get_quantisation_type(config_name: str) -> int: - """Map configuration name to llama_cpp quantisation type constant. - - Supports a wide range of quantisation types from Q2 to Q8, including - K-quants and legacy formats. Handles both simple formats (Q4_K_M, Q6_K) - and custom suffixed variants (Q4_K_M_L, Q5_K_M_XL) by mapping them to - their base types for llama-cpp-python compatibility. - - Returns: - llama_cpp quantisation type constant for base quantisation. - - Raises: - RuntimeError: If llama-cpp-python is not available. - ValueError: If the quantisation type is not supported. - """ - if not LLAMA_CPP_AVAILABLE: - msg = "llama-cpp-python not available" - raise RuntimeError(msg) - - # Normalise the config name to extract base type - # e.g. "Q4_K_L" or "Q4_K_XL" -> "Q4_K_M" (default for Q4_K) - # e.g. 
"Q4_K_M_XXL" -> "Q4_K_M" - config_upper = config_name.upper() - - # Direct mapping for exact matches - type_mapping = { - # Q2 variants (not recommended but supported) - "Q2_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q2_K, - "Q2_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q2_K_S, - # Q3 K-quants - "Q3_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_S, - "Q3_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q3_K_M, - # Q4 K-quants (most common) - "Q4_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_S, - "Q4_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M, - # Q5 K-quants - "Q5_K_S": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_S, - "Q5_K_M": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_K_M, - # Q6_K (single variant) - "Q6_K": llama_cpp.LLAMA_FTYPE_MOSTLY_Q6_K, - # Q8_0 (highest common quantisation) - "Q8_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q8_0, - # Legacy quantisation formats - "Q4_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_0, - "Q4_1": llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_1, - "Q5_0": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_0, - "Q5_1": llama_cpp.LLAMA_FTYPE_MOSTLY_Q5_1, - # IQ (Integer Quantisation) variants - experimental - "IQ2_XXS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_XXS, - "IQ2_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_XS, - "IQ2_S": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_S, - "IQ2_M": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ2_M, - "IQ3_XXS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_XXS, - "IQ3_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_XS, - "IQ3_S": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_S, - "IQ3_M": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ3_M, - "IQ4_NL": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ4_NL, - "IQ4_XS": llama_cpp.LLAMA_FTYPE_MOSTLY_IQ4_XS, - # Higher precision formats - "F16": llama_cpp.LLAMA_FTYPE_MOSTLY_F16, - "BF16": llama_cpp.LLAMA_FTYPE_MOSTLY_BF16, - } - - # Try direct lookup first - if config_upper in type_mapping: - return type_mapping[config_upper] - - # Handle custom variants using base mapping - for prefix, base_type in LlamaCppPythonAPI.VARIANT_BASE_MAPPING.items(): - if config_upper.startswith(prefix) and config_upper not in type_mapping: - return type_mapping[base_type] - - # If not found, raise an informative error - supported = sorted(type_mapping.keys()) - msg = ( - f"Unsupported quantisation type: {config_name}\n" - f"Supported types: {', '.join(supported)}\n" - f"Custom variants like Q4_K_L, Q4_K_XL are also supported." - ) - raise ValueError(msg) - - @staticmethod - def get_tensor_type_value(type_name: str) -> int: - """Convert tensor type name to llama_cpp constant. - - Maps string tensor type names to their corresponding llama_cpp integer - constants for tensor-specific overrides. Provides the foundation for - differentiated quantisation strategies across embedding and output layers. - - Returns: - Integer value for the tensor type, or 0 if not found. - """ - if not LLAMA_CPP_AVAILABLE: - return 0 - - # Build mapping with variant consolidation - # All Q3_K variants map to base Q3_K type, same for Q4_K and Q5_K - type_mapping = LlamaCppPythonAPI._build_tensor_type_mapping() - return type_mapping.get(type_name.upper(), 0) - - @staticmethod - def _build_tensor_type_mapping() -> dict[str, int]: - """Build tensor type mapping with variant consolidation. - - Returns: - Dictionary mapping type names to GGML constants. 
- """ - if not LLAMA_CPP_AVAILABLE: - return {} - - # Base mappings - return { - # Q2 variants - "Q2_K": llama_cpp.GGML_TYPE_Q2_K, - # Q3 variants - all map to base Q3_K - "Q3_K": llama_cpp.GGML_TYPE_Q3_K, - "Q3_K_S": llama_cpp.GGML_TYPE_Q3_K, - "Q3_K_M": llama_cpp.GGML_TYPE_Q3_K, - "Q3_K_L": llama_cpp.GGML_TYPE_Q3_K, - # Q4 variants - "Q4_0": llama_cpp.GGML_TYPE_Q4_0, - "Q4_1": llama_cpp.GGML_TYPE_Q4_1, - "Q4_K": llama_cpp.GGML_TYPE_Q4_K, - "Q4_K_S": llama_cpp.GGML_TYPE_Q4_K, - "Q4_K_M": llama_cpp.GGML_TYPE_Q4_K, - # Q5 variants - "Q5_0": llama_cpp.GGML_TYPE_Q5_0, - "Q5_1": llama_cpp.GGML_TYPE_Q5_1, - "Q5_K": llama_cpp.GGML_TYPE_Q5_K, - "Q5_K_S": llama_cpp.GGML_TYPE_Q5_K, - "Q5_K_M": llama_cpp.GGML_TYPE_Q5_K, - # Q6 variant - "Q6_K": llama_cpp.GGML_TYPE_Q6_K, - # Q8 variant - "Q8_0": llama_cpp.GGML_TYPE_Q8_0, - # Higher precision - "F16": llama_cpp.GGML_TYPE_F16, - "F32": llama_cpp.GGML_TYPE_F32, - } - - def quantise_model_flexible( - self, - input_path: Path, - output_path: Path, - base_type: str, - embedding_type: str | None = None, - output_type: str | None = None, - imatrix_path: Path | None = None, - ) -> bool: - """Quantise model with flexible tensor type configuration. - - Provides control over base quantisation type with optional overrides for - embeddings and output layers, which are the only tensor-specific controls - that work reliably with llama-cpp-python. - - Args: - input_path: Path to input GGUF model. - output_path: Path for output quantised model. - base_type: Base quantisation type (e.g. "Q4_K_M", "Q6_K"). - embedding_type: Override for token embeddings (None = use base). - output_type: Override for output/lm_head layers (None = use base). - imatrix_path: Optional importance matrix file. - - Returns: - True if quantisation successful, False otherwise. - - Examples: - # Q4_K_L: Q4_K_M base with Q8_0 embeddings - api.quantise_model_flexible( - input_path, output_path, "Q4_K_M", - embedding_type="Q8_0" - ) - - # Q3_K_L: Q3_K_M base with Q5_K output - api.quantise_model_flexible( - input_path, output_path, "Q3_K_M", - output_type="Q5_K" - ) - - # Q3_K_XL: Q3_K_M with both Q8_0 embeddings and Q5_K output - api.quantise_model_flexible( - input_path, output_path, "Q3_K_M", - embedding_type="Q8_0", - output_type="Q5_K" - ) - - Raises: - RuntimeError: If llama-cpp-python is not available. 
- """ - if not LLAMA_CPP_AVAILABLE: - msg = "llama-cpp-python not available for quantisation" - raise RuntimeError(msg) - - logger.info(f"šŸ”„ Flexible quantisation: {base_type} base") - logger.info(f"šŸ“ Input: {input_path}") - logger.info(f"šŸ“ Output: {output_path}") - - # Setup phase - create and configure parameters - params = self._create_params(base_type, imatrix_path) - self._apply_tensor_overrides(params, embedding_type, output_type) - - # Execution phase - perform quantisation - try: - logger.debug("DEBUG: Starting flexible quantisation execution") - result = self._do_quantisation(input_path, output_path, params) - logger.debug(f"DEBUG: Flexible quantisation returned: {result}") - - except Exception as e: - logger.error(f"āŒ Flexible quantisation failed with exception: {e}") - logger.error("Flexible quantisation traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - return False - else: - if result == 0: - # Verify output file was created and is valid - if not output_path.exists(): - logger.error( - f"āŒ Quantisation claimed success but output does not exist: {output_path}" - ) - return False - - try: - output_size = output_path.stat().st_size - logger.debug(f"DEBUG: Output file size: {output_size / (1024**3):.2f} GB") - - if output_size == 0: - logger.error("āŒ Output file is empty despite success code") - return False - except Exception as e: - logger.warning(f"āš ļø Could not check output file size: {e}") - - logger.info(f"āœ… Quantisation successful: {output_path.name}") - return True - logger.error(f"āŒ Quantisation failed with code: {result}") - return False - - def _create_params( - self, base_type: str, imatrix_path: Path | None - ) -> llama_model_quantize_params: - """Create quantisation parameters. - - Returns: - Configured quantisation parameters. - """ - params = llama_model_quantize_params() - params.ftype = self.get_quantisation_type(base_type) - params.nthread = 8 - params.allow_requantize = True - - if imatrix_path and imatrix_path.exists(): - # Convert path to bytes and create c_char_p, then cast to c_void_p - imatrix_bytes = str(imatrix_path).encode("utf-8") - char_p = ctypes.c_char_p(imatrix_bytes) - params.imatrix = ctypes.cast(char_p, ctypes.c_void_p) - logger.info(f"🧮 Using imatrix: {imatrix_path.name}") - - return params - - def _apply_tensor_overrides( - self, - params: llama_model_quantize_params, - embedding_type: str | None, - output_type: str | None, - ) -> None: - """Apply embedding and output tensor type overrides to params. - - These are the only tensor-specific controls that work reliably - with llama-cpp-python. - """ - # Apply embedding override if specified - if embedding_type: - params.token_embedding_type = self.get_tensor_type_value(embedding_type) - logger.info(f"āš™ļø Token embedding type: {embedding_type}") - - # Apply output override if specified - if output_type: - params.output_tensor_type = self.get_tensor_type_value(output_type) - params.quantize_output_tensor = True - logger.info(f"āš™ļø Output tensor type: {output_type}") - - def _do_quantisation( - self, - input_path: Path, - output_path: Path, - params: llama_model_quantize_params, - ) -> int: - """Perform the quantisation operation. - - Returns: - Return code (0 for success). - - Raises: - KeyboardInterrupt: If the user interrupts the quantisation process. - SystemExit: If the system exits during quantisation. 
- """ - logger.debug("DEBUG: Calling llama_cpp.llama_model_quantize") - try: - # Flush any pending output before calling C library - - sys.stdout.flush() - sys.stderr.flush() - - # Temporarily redirect stderr to prevent terminal control issues - # Some GGUF models output control sequences that can break the terminal - old_stderr_fd = None - devnull_fd = None - - try: - # Only redirect if not in debug mode to preserve error messages - if not logger.isEnabledFor(logging.DEBUG): - old_stderr_fd = os.dup(2) # Save current stderr - devnull_fd = os.open(os.devnull, os.O_WRONLY) - os.dup2(devnull_fd, 2) # Redirect stderr to /dev/null - - # Call the quantization with proper exception handling - result = llama_cpp.llama_model_quantize( - str(input_path).encode("utf-8"), str(output_path).encode("utf-8"), params - ) - - finally: - # Restore stderr if we redirected it - if old_stderr_fd is not None: - os.dup2(old_stderr_fd, 2) - os.close(old_stderr_fd) - if devnull_fd is not None: - os.close(devnull_fd) - - # Flush output after the call - sys.stdout.flush() - sys.stderr.flush() - except KeyboardInterrupt: - logger.error("āŒ Quantisation interrupted by user") - raise - except SystemExit as e: - logger.error(f"āŒ System exit during quantisation: {e}") - raise - except Exception as e: - logger.error(f"āŒ llama_model_quantize call failed: {e}") - logger.error("llama_model_quantize call traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - raise - else: - logger.debug(f"DEBUG: llama_model_quantize completed with code: {result}") - return result - - def quantise_model( - self, - input_path: Path, - output_path: Path, - config: QuantisationConfig, - imatrix_path: Path | None = None, - ) -> bool: - """Quantise model using Python API. - - Performs quantisation using llama-cpp-python's direct API access with - support for embedding and output tensor type overrides. The L and XL - variants use a base type with specific overrides. - - Returns: - True if quantisation successful, False otherwise. - - Raises: - RuntimeError: If llama-cpp-python is not available. - """ - if not LLAMA_CPP_AVAILABLE: - msg = "llama-cpp-python not available for quantisation" - raise RuntimeError(msg) - - # Force cleanup before starting - gc.collect() - - # Log initial resource state - mem_before = self._log_resource_state("before") - - try: - # Validate input - if not self._validate_input_file(input_path): - return False - # Setup parameters - params = self._setup_quantisation_params(config, imatrix_path) - if params is None: - return False - # Execute quantisation - result = self._execute_quantisation(input_path, output_path, params) - # Verify and finalize - if result == 0: - return self._finalize_successful_quantisation(output_path, mem_before) - - logger.error(f"āŒ Quantisation failed with code: {result}") - except Exception as e: - logger.error(f"āŒ Quantisation failed with exception: {e}") - logger.error("Full quantisation traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - # Garbage collect and return false - gc.collect() - return False - - def _log_resource_state(self, phase: str) -> float: - """Log current resource usage state. - - Args: - phase: Description of current phase (e.g. "before", "after"). - - Returns: - Current memory usage in GB. 
- """ - process = psutil.Process() - memory_gb = process.memory_info().rss / (1024**3) - logger.debug(f"DEBUG: Memory {phase} quantisation: {memory_gb:.2f} GB") - logger.debug(f"DEBUG: Open file descriptors: {len(process.open_files())}") - if phase == "before": - logger.debug(f"DEBUG: Process PID: {process.pid}") - return memory_gb - - def _validate_input_file(self, input_path: Path) -> bool: - """Validate input file exists and is readable. - - Args: - input_path: Path to input file. - - Returns: - True if file is valid, False otherwise. - """ - logger.debug(f"DEBUG: Starting quantisation of {input_path.name}") - logger.info(f"šŸ”„ Quantising {input_path.name}...") - logger.debug(f"DEBUG: Input: {input_path}") - - if not input_path.exists(): - logger.error(f"āŒ Input file does not exist: {input_path}") - return False - - if not input_path.is_file(): - logger.error(f"āŒ Input path is not a file: {input_path}") - return False - - try: - input_size = input_path.stat().st_size - logger.debug(f"DEBUG: Input file size: {input_size / (1024**3):.2f} GB") - if input_size == 0: - logger.error("āŒ Input file is empty") - return False - except Exception as e: - logger.warning(f"āš ļø Could not check input file size: {e}") - - return True - - def _setup_quantisation_params( - self, - config: QuantisationConfig, - imatrix_path: Path | None, - ) -> llama_model_quantize_params | None: - """Setup quantisation parameters. - - Args: - config: Quantisation configuration. - imatrix_path: Optional path to importance matrix. - - Returns: - Configured parameters or None if setup failed. - """ - logger.debug("DEBUG: Setting up quantisation parameters") - params = llama_model_quantize_params() - - # Set base quantisation type - try: - params.ftype = self.get_quantisation_type(config.base_type) - logger.debug( - f"DEBUG: Set ftype to {params.ftype} for {config.base_type} (config: {config.name})" - ) - except Exception as e: - logger.error(f"āŒ Failed to get quantisation type for {config.name}: {e}") - return None - - # Configure basic parameters - params.nthread = 8 - params.allow_requantize = True - logger.debug( - f"DEBUG: Set nthread={params.nthread}, allow_requantize={params.allow_requantize}" - ) - - # Add imatrix if available - if imatrix_path and imatrix_path.exists(): - try: - # Convert path to bytes and create c_char_p, then cast to c_void_p - imatrix_bytes = str(imatrix_path).encode("utf-8") - char_p = ctypes.c_char_p(imatrix_bytes) - params.imatrix = ctypes.cast(char_p, ctypes.c_void_p) - logger.info(f"🧮 Using imatrix: {imatrix_path.name}") - logger.debug(f"DEBUG: imatrix path set: {imatrix_path}") - except Exception as e: - logger.error(f"āŒ Failed to set imatrix: {e}") - # Continue without imatrix - - # Configure tensor-specific types - logger.debug("DEBUG: Configuring tensor-specific types") - try: - self._configure_tensor_types(params, config) - logger.debug("DEBUG: Tensor types configured successfully") - except Exception as e: - logger.error(f"āŒ Failed to configure tensor types: {e}") - logger.error("Tensor type configuration traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - # Continue with default types - - return params - - def _execute_quantisation( - self, - input_path: Path, - output_path: Path, - params: llama_model_quantize_params, - ) -> int: - """Execute the actual quantisation with signal handling. - - Args: - input_path: Path to input model. - output_path: Path for output model. - params: Configured quantisation parameters. 
- - Returns: - Return code from quantisation (0 for success). - """ - logger.debug("DEBUG: Starting llama_cpp.llama_model_quantize call") - logger.debug("DEBUG: About to call llama_model_quantize...") - - # Setup signal handlers - old_handlers = self._setup_signal_handlers() - - try: - result = llama_cpp.llama_model_quantize( - str(input_path).encode("utf-8"), str(output_path).encode("utf-8"), params - ) - logger.debug(f"DEBUG: llama_model_quantize returned: {result}") - except Exception as e: - logger.error(f"āŒ llama_model_quantize raised exception: {e}") - logger.error("llama_model_quantize traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - return -1 - else: - return result - finally: - self._restore_signal_handlers(old_handlers) - - def _setup_signal_handlers(self) -> tuple[Any, Any | None]: - """Setup signal handlers for debugging termination. - - Returns: - Tuple of (old_sigterm, old_sigsegv) handlers. - """ - - def signal_debug_handler(signum: int, frame: object) -> Never: # noqa: ARG001 - logger.error(f"DEBUG: Received signal {signum} during quantisation!") - logger.error(f"DEBUG: Signal name: {signal.Signals(signum).name}") - msg = f"Signal {signum} received" - raise KeyboardInterrupt(msg) - - old_sigterm = signal.signal(signal.SIGTERM, signal_debug_handler) - old_sigsegv = ( - signal.signal(signal.SIGSEGV, signal_debug_handler) - if hasattr(signal, "SIGSEGV") - else None - ) - return old_sigterm, old_sigsegv - - def _restore_signal_handlers(self, handlers: tuple[Any, Any | None]) -> None: - """Restore original signal handlers. - - Args: - handlers: Tuple of (old_sigterm, old_sigsegv) handlers. - """ - old_sigterm, old_sigsegv = handlers - signal.signal(signal.SIGTERM, old_sigterm) - if old_sigsegv is not None: - signal.signal(signal.SIGSEGV, old_sigsegv) - - def _finalize_successful_quantisation( - self, - output_path: Path, - mem_before: float, - ) -> bool: - """Finalize successful quantisation and verify output. - - Args: - output_path: Path to output file. - mem_before: Memory usage before quantisation in GB. - - Returns: - True if output is valid, False otherwise. - """ - logger.debug("DEBUG: Quantisation returned success code") - - # Verify output exists - if not output_path.exists(): - logger.error( - f"āŒ Quantisation claimed success but output does not exist: {output_path}" - ) - return False - - # Verify output size - output_size = output_path.stat().st_size - logger.debug(f"DEBUG: Output file size: {output_size / (1024**3):.2f} GB") - - if output_size == 0: - logger.error("āŒ Output file is empty despite success code") - return False - - logger.info(f"āœ… Quantisation successful: {output_path.name}") - - # Force cleanup and log final state - gc.collect() - mem_after = self._log_resource_state("after") - logger.debug(f"DEBUG: Memory delta: {mem_after - mem_before:+.2f} GB") - - return True - - def _configure_tensor_types( - self, params: llama_model_quantize_params, config: QuantisationConfig - ) -> None: - """Configure tensor-specific quantisation types. - - Sets embedding and output tensor type overrides based on config. - These are the only tensor-specific controls that work reliably - with llama-cpp-python. 
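The signal-handler bracketing used by _execute_quantisation() reduces to the usual install/restore pattern; a stand-alone sketch, with the long-running call replaced by a placeholder.

import signal

def debug_handler(signum: int, frame: object) -> None:  # noqa: ARG001
    msg = f"Signal {signal.Signals(signum).name} received"
    raise KeyboardInterrupt(msg)

old_sigterm = signal.signal(signal.SIGTERM, debug_handler)
try:
    pass  # the blocking llama_model_quantize() call would sit here
finally:
    signal.signal(signal.SIGTERM, old_sigterm)  # always put the old handler back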
- """ - logger.debug(f"DEBUG: _configure_tensor_types called for {config.name}") - - # Apply embedding override if specified - if config.embedding_type: - params.token_embedding_type = self.get_tensor_type_value(config.embedding_type) - logger.info(f"āš™ļø Token embedding type: {config.embedding_type}") - - # Apply output override if specified - if config.output_type: - params.output_tensor_type = self.get_tensor_type_value(config.output_type) - params.quantize_output_tensor = True - logger.info(f"āš™ļø Output tensor type: {config.output_type}") - - def convert_hf_to_gguf( - self, input_dir: Path, output_path: Path, output_type: str = "f16" - ) -> bool: - """Convert HuggingFace model to GGUF format using native Python converter. - - Uses our GGUFConverter for SafeTensors models, providing full Python-based - conversion without external dependencies. - - Returns: - True if conversion successful, False otherwise. - """ - logger.info(f"šŸ”„ Converting {input_dir.name} to GGUF format...") - logger.info(f"šŸ“ Input: {input_dir}") - logger.info(f"šŸ“ Output: {output_path}") - logger.info(f"šŸ“ Type: {output_type}") - - # Check for SafeTensors files - safetensor_files = list(input_dir.glob("*.safetensors")) - if not safetensor_files: - logger.warning("āš ļø No SafeTensors files found in model directory") - return False - - try: - # Load model configuration - config_parser = ConfigParser() - model_config = config_parser.load_model_config(input_dir) - - # Get architecture mapping - arch_name = model_config.architectures[0] if model_config.architectures else "llama" - arch = config_parser.get_architecture_mapping(arch_name) - - if arch != arch_name: - logger.info(f"šŸ“ Architecture mapping: {arch_name} → {arch}") - - # Convert using GGUFConverter - tensor_mapper = TensorMapper() - success = GGUFConverter.convert_safetensors( - input_dir, output_path, model_config, arch, tensor_mapper - ) - except Exception as e: - logger.error(f"āŒ Conversion failed with exception: {e}") - return False - else: - if success: - logger.info("āœ… Native Python conversion successful") - return success diff --git a/helpers/services/orchestrator.py b/helpers/services/orchestrator.py deleted file mode 100644 index 42d82db..0000000 --- a/helpers/services/orchestrator.py +++ /dev/null @@ -1,846 +0,0 @@ -"""Quantisation orchestration service. - -High-level orchestration of the complete quantisation workflow from model -acquisition through processing to upload. Manages parallel processing, -status tracking, and cleanup operations for efficient resource utilisation. 
-""" - -from __future__ import annotations - -import gc -import signal -import subprocess -import sys -import traceback -from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass, field -from pathlib import Path -from typing import TYPE_CHECKING - -import psutil - -from helpers.config.quantisation_configs import ( - DEFAULT_QUANTISATION_TYPES, - QUANTISATION_CONFIGS, - SUPPORTED_QUANTISATION_TYPES, -) -from helpers.logger import logger -from helpers.models.quantisation import ( - ModelSource, - QuantisationContext, - QuantisationResult, - QuantisationType, -) -from helpers.services.huggingface import ReadmeGenerator -from helpers.services.imatrix_generator import IMatrixGenerator -from helpers.services.llama_cpp import IMatrixHandler -from helpers.services.quantisation import HuggingFaceUploader, ModelManager, QuantisationEngine -from helpers.utils.rate_limiter import ReadmeRateLimiter -from helpers.utils.tensor_mapping import URLParser - -if TYPE_CHECKING: - from types import FrameType - from typing import Any - - -@dataclass(slots=True) -class QuantisationOrchestrator: - """Orchestrates the complete quantisation workflow. - - Uses dataclass with slots for efficient memory usage and dependency injection - for modular service interaction following SOLID principles. - """ - - work_dir: Path = field(default_factory=lambda: Path.cwd() / "quantisation_work") - use_imatrix: bool = True - no_upload: bool = False - custom_profiles: list[str] | None = None - - # Service dependencies with factory defaults - url_parser: URLParser = field(default_factory=URLParser) - quantisation_engine: QuantisationEngine = field(default_factory=QuantisationEngine) - imatrix_handler: IMatrixHandler = field(default_factory=IMatrixHandler) - imatrix_generator: IMatrixGenerator = field(default_factory=IMatrixGenerator) - readme_generator: ReadmeGenerator = field(default_factory=ReadmeGenerator) - uploader: HuggingFaceUploader = field(default_factory=HuggingFaceUploader) - - # Computed properties - models_dir: Path = field(init=False) - model_manager: ModelManager = field(init=False) - readme_limiter: ReadmeRateLimiter = field(init=False) - - def __post_init__(self) -> None: - """Initialise computed properties after dataclass construction.""" - self.models_dir = self.work_dir / "models" - self.model_manager = ModelManager(self.models_dir) - self.readme_limiter = ReadmeRateLimiter(cooldown_seconds=30.0) - - # Set up signal handlers for graceful exit tracking - self._setup_signal_handlers() - - def _setup_signal_handlers(self) -> None: - """Set up signal handlers to catch unexpected exits.""" - - def signal_handler(signum: int, frame: FrameType | None) -> None: - logger.error(f"āŒ Received signal {signum} ({signal.Signals(signum).name})") - logger.error("Stack trace at signal:") - if frame: - for line in traceback.format_stack(frame): - logger.error(f" {line.strip()}") - logger.error("Exiting due to signal") - sys.exit(1) - - # Handle common termination signals - for sig in [signal.SIGINT, signal.SIGTERM]: - signal.signal(sig, signal_handler) - - def _check_architecture_support(self, f16_model_path: Path) -> bool: - """Check if the model architecture is supported by llama.cpp. 
- - Args: - f16_model_path: Path to the F16 GGUF model - - Returns: - True if architecture is NOT supported (K-quants should be skipped) - """ - try: - # Try a simple quantization with llama.cpp to check support - result = subprocess.run( - [ - ".cache/llm-gguf-tools/binaries/llama-quantize", - str(f16_model_path), - "/dev/null", - "Q4_K_M", - ], - check=False, - capture_output=True, - text=True, - timeout=5, - ) - - # Check if it failed due to unknown architecture - return bool(result.stderr and "unknown model architecture" in result.stderr.lower()) - except Exception: - # If we can't determine, assume it might work - return False - - def get_quantisation_types(self) -> list[QuantisationType]: - """Get the quantisation types to use for this run. - - Returns: - List of QuantisationType enums to process. - """ - if self.custom_profiles: - # Parse custom profiles from strings to QuantisationType - result = [] - for profile_str in self.custom_profiles: - try: - profile = QuantisationType(profile_str.upper()) - if profile in SUPPORTED_QUANTISATION_TYPES: - result.append(profile) - else: - logger.warning(f"Profile {profile_str} is not supported, skipping") - except ValueError: - logger.warning(f"Invalid profile {profile_str}, skipping") - return result or DEFAULT_QUANTISATION_TYPES - return DEFAULT_QUANTISATION_TYPES - - def quantise(self, url: str) -> dict[QuantisationType, QuantisationResult]: - """Main quantisation workflow orchestrating model processing from URL to upload. - - Returns: - dict[QuantisationType, QuantisationResult]: Quantisation results for each type. - - Raises: - KeyboardInterrupt: If the user interrupts the quantisation process. - """ - logger.info("Starting Bartowski quantisation process...") - logger.debug(f"DEBUG: Input URL: {url}") - logger.debug(f"DEBUG: Working directory: {self.work_dir}") - logger.debug(f"DEBUG: Use imatrix: {self.use_imatrix}") - logger.debug(f"DEBUG: No upload: {self.no_upload}") - logger.debug(f"DEBUG: Custom profiles: {self.custom_profiles}") - - try: - # Setup and preparation - logger.debug("DEBUG: Starting environment setup...") - model_source, f16_model_path, imatrix_path, output_repo = self._setup_environment(url) - logger.debug(f"DEBUG: Environment setup complete. F16 model: {f16_model_path}") - - # Create initial repository - logger.debug("DEBUG: Creating initial repository...") - self._create_initial_repository(model_source, output_repo) - logger.debug("DEBUG: Initial repository created") - - # Execute all quantisations - logger.debug("DEBUG: Starting quantisation execution...") - results = self._execute_quantisations( - model_source, f16_model_path, imatrix_path, output_repo - ) - logger.debug(f"DEBUG: Quantisation execution complete. 
Results: {len(results)} items") - - # Cleanup - logger.debug("DEBUG: Starting cleanup...") - self._cleanup_files(f16_model_path, model_source) - logger.debug("DEBUG: Cleanup complete") - - self._print_completion_summary(model_source, results, output_repo) - except KeyboardInterrupt: - logger.error("āŒ Process interrupted by user (Ctrl+C)") - raise - except Exception as e: - logger.error(f"āŒ Critical error in quantisation workflow: {e}") - logger.error("Full traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - raise - finally: - # Always flush pending README updates before exiting - self.readme_limiter.flush() - - return results - - def _setup_environment(self, url: str) -> tuple[ModelSource, Path, Path | None, str]: - """Setup environment and prepare model for quantisation. - - Returns: - Tuple of (model_source, f16_model_path, imatrix_path, output_repo). - """ - model_source = self.url_parser.parse(url) - self._print_model_info(model_source) - - self.models_dir.mkdir(parents=True, exist_ok=True) - f16_model_path = self.model_manager.prepare_model(model_source) - - output_repo = ( - f"{self.uploader.get_username()}/" - f"{model_source.original_author}-{model_source.model_name}-GGUF" - ) - - imatrix_path = None - if self.use_imatrix: - logger.info("Checking for importance matrix (imatrix)...") - model_dir = self.models_dir / model_source.model_name - imatrix_path = self.imatrix_handler.find_imatrix(model_dir) - - # If no imatrix found, offer to generate or provide one - if not imatrix_path: - # First offer to generate - imatrix_path = self.imatrix_generator.prompt_for_generation( - model_source, model_dir, f16_model_path - ) - - # If generation was skipped, offer to provide existing one - if not imatrix_path: - imatrix_path = self.imatrix_handler.prompt_for_user_imatrix(model_dir) - - return model_source, f16_model_path, imatrix_path, output_repo - - def _create_initial_repository(self, model_source: ModelSource, output_repo: str) -> None: - """Create initial repository with planned quantisations.""" - logger.info("Creating initial README with planned quantisations...") - quantisation_types = self.get_quantisation_types() - planned_results = { - qt: QuantisationResult(quantisation_type=qt, success=False, status="planned") - for qt in quantisation_types - } - readme_path = self.readme_generator.generate( - model_source, planned_results, self.models_dir, output_repo - ) - - if not self.no_upload: - logger.info("Creating repository with planned quantisations...") - self.uploader.upload_readme(output_repo, readme_path) - else: - logger.info("Skipping repository creation (--no-upload specified)") - - def _execute_quantisations( - self, - model_source: ModelSource, - f16_model_path: Path, - imatrix_path: Path | None, - output_repo: str, - ) -> dict[QuantisationType, QuantisationResult]: - """Execute all quantisation types with parallel uploads. - - Returns: - dict[QuantisationType, QuantisationResult]: Quantisation results for each type. 
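For reference, the architecture probe performed by _check_architecture_support() boils down to a throwaway quantisation attempt whose stderr is inspected; a sketch using the cached binary path shown above (the model path is a placeholder).

import subprocess
from pathlib import Path

def architecture_unsupported(f16_model: Path) -> bool:
    """Return True when llama.cpp reports an unknown model architecture."""
    try:
        result = subprocess.run(
            [
                ".cache/llm-gguf-tools/binaries/llama-quantize",
                str(f16_model),
                "/dev/null",          # only the error message matters here
                "Q4_K_M",
            ],
            check=False,
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception:
        return False  # if the probe itself fails, assume the architecture may work
    return bool(result.stderr and "unknown model architecture" in result.stderr.lower())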
- """ - results: dict[QuantisationType, QuantisationResult] = {} - - quantisation_types = self.get_quantisation_types() - types_list = [qt.value for qt in quantisation_types] - logger.info(f"Processing {len(quantisation_types)} quantisation types: {types_list}") - - # Check architecture support upfront - architecture_unsupported = self._check_architecture_support(f16_model_path) - - if architecture_unsupported: - logger.warning("āš ļø Architecture not supported by llama.cpp - K-quants will be skipped") - logger.info("šŸ’” Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated") - - # Pre-mark all K-quants as skipped - basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] - for quant_type in quantisation_types: - if quant_type.value not in basic_types: - results[quant_type] = QuantisationResult( - quantisation_type=quant_type, - success=False, - status="failed", - error_message="K-quant requires llama.cpp architecture support", - ) - - # Track F16 in results for status display (if we converted from SafeTensors) - if not model_source.is_gguf_repo: - # Get F16 file size - f16_size = "-" - if f16_model_path.exists(): - size_bytes = f16_model_path.stat().st_size - size_gb = size_bytes / (1024**3) - f16_size = f"{size_gb:.1f}GB" - - # Create a simple object for F16 tracking (not a QuantisationResult) - # since F16 isn't a quantisation type in our enum - f16_result = type( - "F16Result", - (), - { - "quantisation_type": "F16", - "success": True, - "status": "planned", - "file_path": f16_model_path, - "file_size": f16_size, - }, - )() - results[QuantisationType.F16] = f16_result - - # Process with parallel uploads - quantise sequentially but upload in background - upload_futures: list[Any] = [] - architecture_unsupported = False - - with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as upload_executor: - # Start F16 upload first if we have one - if ( - not model_source.is_gguf_repo - and not self.no_upload - and QuantisationType.F16 in results - ): - f16_result = results[QuantisationType.F16] - if f16_result.file_path and f16_result.file_path.exists(): - logger.info("Starting parallel upload of F16 GGUF...") - f16_result.status = "uploading" - self._update_readme_status(model_source, results, output_repo) - - upload_future = upload_executor.submit( - self._upload_f16_and_cleanup, - output_repo, - f16_result.file_path, - model_source, - results, - ) - upload_futures.append(upload_future) - for i, quant_type in enumerate(quantisation_types, 1): - # Skip if already marked as failed (e.g., K-quants for unsupported arch) - if quant_type in results and results[quant_type].status == "failed": - logger.info( - f"Skipping {quant_type.value} - {results[quant_type].error_message}" - ) - continue - - logger.info( - f"Processing quantisation {i}/{len(quantisation_types)}: {quant_type.value}" - ) - logger.debug(f"DEBUG: Starting quantisation {i}/{len(quantisation_types)}") - logger.debug(f"DEBUG: Current type: {quant_type.value}") - logger.debug(f"DEBUG: Results so far: {len(results)} completed") - - try: - result = self._process_single_quantisation( - quant_type, - model_source, - f16_model_path, - imatrix_path, - output_repo, - results, - upload_executor, - upload_futures, - ) - results[quant_type] = result - logger.debug(f"DEBUG: Quantisation {quant_type.value} completed") - - # Check if this failed due to unsupported architecture - if ( - not result.success - and hasattr(self.quantisation_engine.executor, "last_error") - and self.quantisation_engine.executor.last_error - == 
"unsupported_architecture" - ): - logger.warning( - "āš ļø Architecture not supported by llama.cpp - K-quants will be skipped" - ) - logger.info( - "šŸ’” Basic types (Q4_0, Q5_0, Q6_0, Q8_0) will still be generated" - ) - architecture_unsupported = True - # Update the current result to also show as skipped - result.error_message = "Architecture not supported by llama.cpp" - # Update README immediately to show remaining K-quants as skipped - # But don't mark basic types as failed - they can still use GGML - basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] - for remaining_quant_type in quantisation_types[i:]: - if remaining_quant_type not in results: - # Only mark K-quants as failed due to architecture - if remaining_quant_type.value not in basic_types: - results[remaining_quant_type] = QuantisationResult( - quantisation_type=remaining_quant_type, - success=False, - status="failed", - error_message="K-quant requires llama.cpp architecture support", - ) - self._update_readme_status(model_source, results, output_repo) - - # Force cleanup between quantisations - gc.collect() - logger.debug("DEBUG: Garbage collection completed") - - except Exception as e: - logger.error(f"āŒ Critical error processing {quant_type.value}: {e}") - logger.error("Exception traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - results[quant_type] = QuantisationResult( - quantisation_type=quant_type, - success=False, - status="failed", - error_message=str(e), - ) - - # Force cleanup after error - gc.collect() - - # Wait for all uploads to complete before returning - self._wait_for_uploads(upload_futures) - - # Final README update to ensure all statuses are accurate - if not self.no_upload and upload_futures: - logger.info("Updating README with final status...") - final_readme = self.readme_generator.generate( - model_source, results, self.models_dir, output_repo - ) - self.uploader.upload_readme(output_repo, final_readme) - - return results - - def _process_single_quantisation( - self, - quant_type: QuantisationType, - model_source: ModelSource, - f16_model_path: Path, - imatrix_path: Path | None, - output_repo: str, - results: dict[QuantisationType, QuantisationResult], - upload_executor: ThreadPoolExecutor, - upload_futures: list, - ) -> QuantisationResult: - """Process a single quantisation type. - - Returns: - QuantisationResult: Result of the quantisation attempt. - """ - try: - logger.info(f"Starting {quant_type.value} quantisation...") - logger.debug(f"DEBUG: Getting config for {quant_type.value}") - config = QUANTISATION_CONFIGS[quant_type] - logger.debug(f"DEBUG: Config loaded: {config.name}") - - # Update status to processing - logger.debug("DEBUG: Creating initial quantisation result") - result = QuantisationResult(quantisation_type=quant_type, success=False) - result.status = "processing" - results[quant_type] = result - - logger.debug("DEBUG: Updating README status") - self._update_readme_status(model_source, results, output_repo) - - # Perform quantisation - logger.debug("DEBUG: Creating quantisation context") - context = QuantisationContext( - f16_model_path=f16_model_path, - model_source=model_source, - config=config, - models_dir=self.models_dir, - imatrix_path=imatrix_path, - ) - logger.debug(f"DEBUG: Context created. 
F16 path: {f16_model_path}") - logger.debug(f"DEBUG: imatrix path: {imatrix_path}") - logger.debug("DEBUG: Calling quantisation engine...") - result = self.quantisation_engine.quantise(context) - logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}") - - self._handle_quantisation_result( - result, - quant_type, - model_source, - results, - output_repo, - upload_executor, - upload_futures, - ) - except Exception as e: - return self._handle_quantisation_error( - e, quant_type, model_source, results, output_repo - ) - else: - return result - - def _process_single_quantisation_sequential( - self, - quant_type: QuantisationType, - model_source: ModelSource, - f16_model_path: Path, - imatrix_path: Path | None, - output_repo: str, - results: dict[QuantisationType, QuantisationResult], - ) -> QuantisationResult: - """Process a single quantisation type sequentially with immediate upload. - - Returns: - QuantisationResult: Result of the quantisation attempt. - """ - # Force cleanup before starting new quantisation - gc.collect() - - # Log system state before quantisation - process = psutil.Process() - logger.debug(f"DEBUG: === System state before {quant_type.value} ===") - logger.debug(f"DEBUG: Process alive: {process.is_running()}") - logger.debug(f"DEBUG: PID: {process.pid}") - logger.debug(f"DEBUG: Memory: {process.memory_info().rss / (1024**3):.2f} GB") - logger.debug(f"DEBUG: CPU percent: {process.cpu_percent()}%") - logger.debug(f"DEBUG: Threads: {process.num_threads()}") - logger.debug(f"DEBUG: Open files: {len(process.open_files())}") - - try: - logger.info(f"Starting {quant_type.value} quantisation...") - logger.debug(f"DEBUG: Getting config for {quant_type.value}") - config = QUANTISATION_CONFIGS[quant_type] - logger.debug(f"DEBUG: Config loaded: {config.name}") - - # Update status to processing - logger.debug("DEBUG: Creating initial quantisation result") - result = QuantisationResult(quantisation_type=quant_type, success=False) - result.status = "processing" - results[quant_type] = result - - logger.debug("DEBUG: Updating README status") - self._update_readme_status(model_source, results, output_repo) - - # Perform quantisation - logger.debug("DEBUG: Creating quantisation context") - context = QuantisationContext( - f16_model_path=f16_model_path, - model_source=model_source, - config=config, - models_dir=self.models_dir, - imatrix_path=imatrix_path, - ) - logger.debug(f"DEBUG: Context created. 
F16 path: {f16_model_path}") - logger.debug(f"DEBUG: imatrix path: {imatrix_path}") - logger.debug("DEBUG: Calling quantisation engine...") - result = self.quantisation_engine.quantise(context) - logger.debug(f"DEBUG: Quantisation engine returned: success={result.success}") - - if result.success and result.file_path: - # Upload immediately (if not in no-upload mode) - if not self.no_upload: - logger.info(f"Uploading {quant_type.value}...") - try: - self.uploader.upload_model_file(output_repo, result.file_path) - logger.info(f"Upload of {quant_type.value} completed successfully") - - # Clean up file after successful upload - logger.info(f"Removing {result.file_path.name} to save disk space...") - result.file_path.unlink() - - result.status = "completed" - self._update_readme_status(model_source, results, output_repo) - except Exception as upload_error: - logger.error(f"Failed to upload {quant_type.value}: {upload_error}") - result.status = "failed" - result.error_message = str(upload_error) - self._update_readme_status(model_source, results, output_repo) - # Keep file if upload failed - else: - # No upload mode - just mark as completed - result.status = "completed" - logger.info(f"Skipping upload of {quant_type.value} (--no-upload specified)") - else: - result.status = "failed" - self._update_readme_status(model_source, results, output_repo) - except Exception as e: - logger.error(f"Error processing {quant_type.value}: {e}") - result = QuantisationResult(quantisation_type=quant_type, success=False) - result.status = "failed" - result.error_message = str(e) - - try: - self._update_readme_status(model_source, results, output_repo) - except Exception as readme_error: - logger.error(f"Failed to update README after error: {readme_error}") - # Force cleanup after error - gc.collect() - return result - else: - # Force cleanup after quantisation - gc.collect() - return result - - def _handle_quantisation_result( - self, - result: QuantisationResult, - quant_type: QuantisationType, - model_source: ModelSource, - results: dict[QuantisationType, QuantisationResult], - output_repo: str, - upload_executor: ThreadPoolExecutor, - upload_futures: list, - ) -> None: - """Handle successful or failed quantisation result.""" - if result.success and result.file_path: - quant_str = getattr(result.quantisation_type, "value", result.quantisation_type) - logger.info(f"Starting parallel upload of {quant_str}...") - upload_future = upload_executor.submit( - self._upload_and_cleanup, - output_repo, - result.file_path, - quant_type, - model_source, - results, - ) - upload_futures.append(upload_future) - result.file_path = None # Mark as being uploaded - result.status = "uploading" - else: - result.status = "failed" - - self._update_readme_status(model_source, results, output_repo) - - def _handle_quantisation_error( - self, - error: Exception, - quant_type: QuantisationType, - model_source: ModelSource, - results: dict[QuantisationType, QuantisationResult], - output_repo: str, - ) -> QuantisationResult: - """Handle quantisation processing error. - - Returns: - QuantisationResult: Failed quantisation result with error information. 
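The sequential-quantise/parallel-upload flow above is essentially a producer loop feeding a two-worker thread pool; a schematic version with quantise_one() and upload_file() as placeholders for the real engine and uploader.

from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

def quantise_one(quant_type: str) -> Path:
    return Path(f"model-{quant_type}.gguf")        # stand-in for the real engine

def upload_file(path: Path) -> None:
    print(f"uploading {path.name}")                # stand-in for the real uploader

futures = []
with ThreadPoolExecutor(max_workers=2, thread_name_prefix="upload") as pool:
    for quant_type in ["Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0"]:
        produced = quantise_one(quant_type)        # blocking, one type at a time
        futures.append(pool.submit(upload_file, produced))  # upload in background
    for future in futures:
        future.result(timeout=300)                 # surface upload errors, 5 min cap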
- """ - logger.error(f"Error processing {quant_type.value}: {error}") - result = QuantisationResult(quantisation_type=quant_type, success=False) - result.status = "failed" - result.error_message = str(error) - - try: - self._update_readme_status(model_source, results, output_repo) - except Exception as readme_error: - logger.error(f"Failed to update README after error: {readme_error}") - - return result - - def _update_readme_status( - self, - model_source: ModelSource, - results: dict[QuantisationType, QuantisationResult], - output_repo: str, - ) -> None: - """Update README with current quantisation status using rate limiting.""" - if not self.no_upload: - # Use rate limiter to batch updates - self.readme_limiter.request_update( - self._do_readme_update, - model_source, - results, - output_repo, - ) - - def _do_readme_update( - self, - model_source: ModelSource, - results: dict[QuantisationType, QuantisationResult], - output_repo: str, - ) -> None: - """Actually perform the README update (called by rate limiter).""" - updated_readme_path = self.readme_generator.generate( - model_source, results, self.models_dir, output_repo - ) - self.uploader.upload_readme(output_repo, updated_readme_path) - - def _wait_for_uploads(self, upload_futures: list) -> None: - """Wait for all parallel uploads to complete.""" - if not upload_futures: - return - - logger.info(f"Waiting for {len(upload_futures)} uploads to complete...") - completed = 0 - failed = 0 - - for future in upload_futures: - try: - future.result(timeout=300) # 5 minute timeout per upload - completed += 1 - logger.info(f"Upload progress: {completed}/{len(upload_futures)} completed") - except Exception as e: - failed += 1 - logger.warning(f"Upload error ({completed + failed}/{len(upload_futures)}): {e}") - - if failed > 0: - logger.warning(f"Upload summary: {completed} succeeded, {failed} failed") - else: - logger.info(f"All {completed} uploads completed successfully") - - def _cleanup_files(self, f16_model_path: Path, model_source: ModelSource) -> None: - """Clean up temporary files after processing.""" - if f16_model_path.exists(): - logger.info(f"Removing F16 model {f16_model_path.name} to save disk space...") - f16_model_path.unlink() - - if not model_source.is_gguf_repo: - self._cleanup_original_model(model_source) - - def _cleanup_original_model(self, model_source: ModelSource) -> None: - """Clean up original safetensors/PyTorch files after successful conversion.""" - model_dir = self.models_dir / model_source.model_name - - pytorch_files = list(model_dir.glob("pytorch_model*.bin")) - if pytorch_files: - logger.info(f"Removing {len(pytorch_files)} PyTorch model files to save disk space...") - for file in pytorch_files: - file.unlink() - - logger.info("Keeping config files, tokeniser, and metadata for reference") - - def _upload_and_cleanup( - self, - output_repo: str, - file_path: Path, - quant_type: QuantisationType, - model_source: ModelSource, - results: dict[QuantisationType, QuantisationResult], - ) -> None: - """Upload file and clean up (runs in background thread).""" - try: - logger.info(f"[PARALLEL] Starting upload of {quant_type.value} ({file_path.name})") - self.uploader.upload_model_file(output_repo, file_path) - logger.info(f"[PARALLEL] Upload of {quant_type.value} completed successfully") - - logger.info(f"[PARALLEL] Removing {file_path.name} to save disk space...") - file_path.unlink() - - results[quant_type].status = "completed" - updated_readme_path = self.readme_generator.generate( - model_source, results, 
self.models_dir, output_repo - ) - self.uploader.upload_readme(output_repo, updated_readme_path) - - logger.info(f"[PARALLEL] {quant_type} upload and cleanup complete") - except Exception as e: - logger.error(f"[PARALLEL] Failed to upload {quant_type}: {e}") - results[quant_type].status = "failed" - results[quant_type].error_message = str(e) - - try: - updated_readme_path = self.readme_generator.generate( - model_source, results, self.models_dir, output_repo - ) - self.uploader.upload_readme(output_repo, updated_readme_path) - except Exception as readme_error: - logger.error( - f"[PARALLEL] Failed to update README after upload error: {readme_error}" - ) - # Don't re-raise - let other uploads continue - - def _upload_f16_and_cleanup( - self, - output_repo: str, - file_path: Path, - model_source: ModelSource, - results: dict[QuantisationType, QuantisationResult], - ) -> None: - """Upload F16 file and clean up (runs in background thread).""" - try: - logger.info(f"[PARALLEL] Starting upload of F16 GGUF ({file_path.name})") - self.uploader.upload_model_file(output_repo, file_path) - logger.info("[PARALLEL] Upload of F16 GGUF completed successfully") - - # Don't delete F16 yet - we still need it for quantisations - # It will be deleted in _cleanup_files after all quantisations complete - - results[QuantisationType.F16].status = "completed" - updated_readme_path = self.readme_generator.generate( - model_source, results, self.models_dir, output_repo - ) - self.uploader.upload_readme(output_repo, updated_readme_path) - - logger.info("[PARALLEL] F16 upload complete") - except Exception as e: - logger.error(f"[PARALLEL] Failed to upload F16: {e}") - results[QuantisationType.F16].status = "failed" - results[QuantisationType.F16].error_message = str(e) - - try: - updated_readme_path = self.readme_generator.generate( - model_source, results, self.models_dir, output_repo - ) - self.uploader.upload_readme(output_repo, updated_readme_path) - except Exception as readme_error: - logger.error( - f"[PARALLEL] Failed to update README after F16 upload error: {readme_error}" - ) - # Don't re-raise - let other uploads continue - - def _print_model_info(self, model_source: ModelSource) -> None: - """Print model information.""" - logger.info(f"Source URL: {model_source.url}") - logger.info(f"Source model: {model_source.source_model}") - logger.info(f"Original author: {model_source.original_author}") - logger.info(f"Model name: {model_source.model_name}") - logger.info(f"Your HF username: {self.uploader.get_username()}") - logger.info(f"Working directory: {self.work_dir}") - - def _print_completion_summary( - self, - model_source: ModelSource, - results: dict[QuantisationType, QuantisationResult], - output_repo: str, - ) -> None: - """Print completion summary.""" - successful_results = [r for r in results.values() if r.success] - - if successful_results: - logger.info("Complete! 
Your quantised models are available at:") - logger.info(f" https://huggingface.co/{output_repo}") - logger.info("Model info:") - logger.info(f" - Source URL: {model_source.url}") - logger.info(f" - Original: {model_source.source_model}") - logger.info( - " - Method: " - f"{'Direct GGUF download' if model_source.is_gguf_repo else 'HF model conversion'}" - ) - logger.info(f" - Quantised: {output_repo}") - - for result in successful_results: - if result.file_size: - filename = ( - f"{model_source.original_author}-{model_source.model_name}-" - f"{result.quantisation_type}.gguf" - ) - logger.info(f" - {result.quantisation_type}: {filename} ({result.file_size})") - else: - logger.error( - "All quantisations failed - repository created with documentation " - "but no model files" - ) - logger.error(f" Repository: https://huggingface.co/{output_repo}") diff --git a/helpers/services/quantisation.py b/helpers/services/quantisation.py deleted file mode 100644 index ae9cc6f..0000000 --- a/helpers/services/quantisation.py +++ /dev/null @@ -1,742 +0,0 @@ -"""Quantisation operations service. - -Provides modular quantisation engine, model management, and upload capabilities -for GGUF model processing. Consolidates quantisation logic from various tools -into reusable components following SOLID principles. -""" - -from __future__ import annotations - -import shutil -import subprocess -import tempfile -import time -import traceback -from pathlib import Path - -from helpers.logger import logger -from helpers.models.quantisation import ( - ModelSource, - QuantisationContext, - QuantisationResult, - QuantisationType, -) -from helpers.services.filesystem import FilesystemService -from helpers.services.ggml_quantise import GGMLQuantiser -from helpers.services.gguf import GGUFConverter -from helpers.services.llama_cpp import QuantisationExecutor -from helpers.utils.config_parser import ConfigParser -from helpers.utils.tensor_mapping import TensorMapper - - -class QuantisationEngine: - """Handles the actual quantisation process with configurable methods. - - Provides flexible quantisation execution supporting multiple tensor - precision configurations, importance matrices, and fallback strategies. - Uses direct llama.cpp binary execution with proper tensor overrides. - """ - - def __init__(self) -> None: - """Initialise quantisation engine.""" - self.fs = FilesystemService() - self.executor = QuantisationExecutor() - self.ggml_quantiser = GGMLQuantiser() - - def quantise(self, context: QuantisationContext) -> QuantisationResult: - """Perform quantisation using the specified configuration. - - Executes quantisation using direct llama.cpp binary with proper - tensor override flags for L and XL variants. Falls back to GGML - for basic types when architecture is unsupported. - - Returns: - QuantisationResult with success status and file information. - """ - logger.info( - f"āš™ļø Creating {context.config.name} quantisation ({context.config.description})..." 
- ) - - output_path = context.get_output_path() - - # Check input file exists and is readable - if not context.f16_model_path.exists(): - error_msg = f"Input model file does not exist: {context.f16_model_path}" - logger.error(f"āŒ {error_msg}") - return QuantisationResult( - quantisation_type=QuantisationType(context.config.name), - success=False, - error_message=error_msg, - ) - - logger.info(f"šŸŽÆ Attempting {context.config.name} quantisation...") - logger.info(f"šŸ“ Source: {context.f16_model_path}") - logger.info(f"šŸ“ Target: {output_path}") - - # Determine if this is a basic type that can use GGML - basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"] - is_basic_type = context.config.name in basic_types - - try: - # Try llama.cpp first for all types - logger.info("šŸ”§ Using llama.cpp binary for quantisation...") - - success = self.executor.execute_quantisation( - context.f16_model_path, output_path, context.config, context.imatrix_path - ) - - if success: - return self._create_success_result(context.config.name, output_path, "llama.cpp") - - # Check if this was an architecture error and we can use GGML fallback - if ( - hasattr(self.executor, "last_error") - and self.executor.last_error == "unsupported_architecture" - and is_basic_type - ): - logger.info("šŸ”„ Architecture unsupported - using GGML implementation...") - - success = self.ggml_quantiser.try_alternative_quantisation( - context.f16_model_path, output_path, context.config.name - ) - - if success: - return self._create_success_result( - context.config.name, output_path, "GGML numpy" - ) - - logger.error(f"āŒ {context.config.name} quantisation failed") - return QuantisationResult( - quantisation_type=QuantisationType(context.config.name), - success=False, - error_message="Quantisation failed via Python API", - ) - - except Exception as e: - logger.error(f"āŒ Exception during {context.config.name} quantisation: {e}") - logger.error("Exception traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - - return QuantisationResult( - quantisation_type=QuantisationType(context.config.name), - success=False, - error_message=f"Exception during quantisation: {e!s}", - ) - - def _create_success_result( - self, quant_type: str, output_path: Path, method_used: str - ) -> QuantisationResult: - """Create successful quantisation result with file metadata. - - Returns: - QuantisationResult with file path and size information. - """ - file_size = self.fs.get_file_size(output_path) - return QuantisationResult( - quantisation_type=QuantisationType(quant_type), - success=True, - file_path=output_path, - file_size=file_size, - method_used=method_used, - ) - - -class ModelManager: - """Handles model downloading and preparation for quantisation. - - Manages both GGUF repository downloads and HuggingFace model conversions, - providing unified interface for model acquisition and preparation. - """ - - def __init__(self, models_dir: Path) -> None: - """Initialise model manager with storage configuration. - - Sets up model storage directory for model downloads and conversions. - """ - self.models_dir = models_dir - self.fs = FilesystemService() - - def prepare_model(self, model_source: ModelSource) -> Path: - """Prepare model for quantisation and return F16 model path. - - Handles both GGUF repository downloads and regular HuggingFace model - conversion workflows with automatic format detection. - - Returns: - Path to F16 GGUF model ready for quantisation. 
- """ - model_dir = self.models_dir / model_source.model_name - - if model_source.is_gguf_repo: - return self._handle_gguf_repo(model_source, model_dir) - return self._handle_regular_repo(model_source, model_dir) - - def _handle_gguf_repo(self, model_source: ModelSource, model_dir: Path) -> Path: - """Handle GGUF repository download with pattern matching. - - Downloads GGUF files matching specified patterns, prioritising - multi-part files and F16 variants. - - Returns: - Path to downloaded or existing GGUF file. - """ - logger.info(f"ā¬‡ļø Downloading GGUF file from repository: {model_source.source_model}") - logger.info(f"šŸ” Looking for file pattern: *{model_source.gguf_file_pattern}*") - - f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf" - - if f16_model.exists(): - logger.info(f"āœ… Found existing F16 file: {f16_model.name}") - return f16_model - - # Check for existing GGUF files - model_dir.mkdir(parents=True, exist_ok=True) - existing_gguf = self.fs.find_gguf_files(model_dir) - - if existing_gguf: - logger.info(f"āœ… Found existing GGUF file: {existing_gguf[0].name}") - return existing_gguf[0] - - # Download with patterns - downloaded_file = self._download_gguf_with_patterns( - model_source.source_model, model_source.gguf_file_pattern, model_dir - ) - - if downloaded_file: - # Handle multi-part files - if "00001-of-" in downloaded_file.name: - return downloaded_file - if "-00002-of-" in downloaded_file.name or "-00003-of-" in downloaded_file.name: - base_name = downloaded_file.name.replace("-00002-of-", "-00001-of-").replace( - "-00003-of-", "-00001-of-" - ) - first_part = downloaded_file.parent / base_name - if first_part.exists(): - logger.info(f"šŸ”„ Using first part: {first_part.name}") - return first_part - - # Rename single file to standard name - downloaded_file.rename(f16_model) - return f16_model - - # Fallback to regular conversion - logger.info("šŸ’” Falling back to downloading full repository and converting...") - return self._handle_regular_repo( - ModelSource(**{**model_source.dict(), "is_gguf_repo": False}), - model_dir, - ) - - def _download_gguf_with_patterns( - self, source_model: str, pattern: str | None, model_dir: Path - ) -> Path | None: - """Download GGUF file using various pattern strategies. - - Tries multiple pattern variations to find and download appropriate - GGUF files, handling timeouts and temporary directories. - - Returns: - Path to downloaded file, or None if all patterns fail. 
- """ - if pattern: - patterns = [ - f"*{pattern}*", - f"*{pattern.lower()}*", - f"*{pattern.upper()}*", - "*f16*", - "*F16*", - "*fp16*", - ] - else: - patterns = ["*f16*", "*F16*", "*fp16*"] - - temp_dir = model_dir / "gguf_temp" - - for search_pattern in patterns: - logger.info(f"šŸ” Trying pattern: {search_pattern}") - temp_dir.mkdir(exist_ok=True) - - try: - logger.debug( - f"DEBUG: Running huggingface-cli download for pattern {search_pattern}" - ) - result = subprocess.run( - [ - "timeout", - "300", - "huggingface-cli", - "download", - source_model, - "--include", - search_pattern, - "--local-dir", - str(temp_dir), - ], - check=True, - capture_output=True, - text=True, - ) - logger.debug( - f"DEBUG: Download command completed with return code {result.returncode}" - ) - - # Find downloaded GGUF files - gguf_files = self.fs.find_gguf_files(temp_dir, pattern) - if gguf_files: - found_file = gguf_files[0] - logger.info(f"āœ… Found GGUF file: {found_file.name}") - - # Move to parent directory - final_path = model_dir / found_file.name - shutil.move(str(found_file), str(final_path)) - shutil.rmtree(temp_dir) - return final_path - - except subprocess.CalledProcessError as e: - logger.debug( - f"DEBUG: Pattern {search_pattern} failed with return code {e.returncode}" - ) - if e.stderr: - logger.debug(f"DEBUG: stderr: {e.stderr}") - if e.stdout: - logger.debug(f"DEBUG: stdout: {e.stdout}") - logger.info(f"āš ļø Pattern {search_pattern} failed or timed out") - continue - except Exception as e: - logger.error(f"āŒ Unexpected error during download: {e}") - logger.error("Exception traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - continue - finally: - if temp_dir.exists(): - shutil.rmtree(temp_dir, ignore_errors=True) - - return None - - def _handle_regular_repo( - self, - model_source: ModelSource, - model_dir: Path, - ) -> Path: - """Handle regular HuggingFace repository conversion. - - Downloads full model repository and converts to F16 GGUF format - using our native Python-based GGUFConverter for SafeTensors models. - - Returns: - Path to converted F16 GGUF model. - """ - logger.info(f"ā¬‡ļø Downloading source model: {model_source.source_model}") - - # Download model if needed - if not model_dir.exists(): - self._download_repository(model_source.source_model, model_dir) - else: - logger.info("āœ… Model already downloaded") - - # Convert to GGUF - return self._convert_to_gguf(model_source, model_dir) - - def _download_repository(self, source_model: str, model_dir: Path) -> None: - """Download HuggingFace repository. - - Args: - source_model: HuggingFace model identifier. - model_dir: Local directory for download. - - Raises: - RuntimeError: If download fails. 
- """ - # Ensure the model directory and .huggingface subdirectory exist - model_dir.mkdir(parents=True, exist_ok=True) - huggingface_dir = model_dir / ".huggingface" - huggingface_dir.mkdir(parents=True, exist_ok=True) - - try: - logger.info(f"ā¬‡ļø Downloading full repository: {source_model}") - logger.info("šŸ“Š Progress will be shown below...") - - # Use subprocess.Popen to stream output in real-time - process = subprocess.Popen( - [ - "huggingface-cli", - "download", - source_model, - "--local-dir", - str(model_dir), - ], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, # Line buffered - universal_newlines=True, - ) - - # Stream output line by line - if process.stdout: - for line in process.stdout: - # Log download progress lines - if line.strip(): - # Check if it's a progress line (contains %) - if "%" in line or "Downloading" in line or "Fetching" in line: - # Use info level for progress lines - logger.info(f" {line.strip()}") - else: - # Use debug for other output - logger.debug(f" {line.strip()}") - - # Wait for process to complete - return_code = process.wait() - - if return_code != 0: - msg = f"Repository download failed with return code {return_code}" - raise RuntimeError(msg) - - logger.info("āœ… Repository download completed successfully") - - except subprocess.CalledProcessError as e: - logger.error(f"āŒ Failed to download repository {source_model}") - logger.error(f"Return code: {e.returncode}") - if e.stderr: - logger.error(f"stderr: {e.stderr}") - if e.stdout: - logger.error(f"stdout: {e.stdout}") - msg = f"Repository download failed: {e}" - raise RuntimeError(msg) from e - except Exception as e: - logger.error(f"āŒ Unexpected error during repository download: {e}") - logger.error("Exception traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - raise - - def _convert_to_gguf(self, model_source: ModelSource, model_dir: Path) -> Path: - """Convert model to GGUF F16 format. - - Args: - model_source: Model source information. - model_dir: Directory containing model files. - - Returns: - Path to F16 GGUF model. - - Raises: - RuntimeError: If conversion fails. 
- """ - logger.info("šŸ”„ Converting to GGUF F16 format...") - f16_model = model_dir / f"{model_source.original_author}-{model_source.model_name}-f16.gguf" - - if f16_model.exists(): - logger.info("āœ… F16 model already exists") - return f16_model - - # Check for SafeTensors files - safetensor_files = list(model_dir.glob("*.safetensors")) - if not safetensor_files: - logger.error("āŒ Model format not supported") - logger.info("šŸ’” This tool supports GGUF and SafeTensors formats") - msg = "Model must be in GGUF or SafeTensors format" - raise RuntimeError(msg) - - logger.info("šŸ Using native Python GGUFConverter...") - logger.info(f"āœ… Found {len(safetensor_files)} SafeTensors files") - - # Load model configuration - config_parser = ConfigParser() - model_config = config_parser.load_model_config(model_dir) - - # Get architecture mapping - arch_name = model_config.architectures[0] if model_config.architectures else "llama" - arch = config_parser.get_architecture_mapping(arch_name) - - if arch != arch_name: - logger.info(f"šŸ“ Architecture mapping: {arch_name} → {arch}") - - # Check if architecture is supported by llama.cpp - supported_archs = { - "llama", - "qwen2", - "gemma", - "phi3", - "falcon", - "gpt2", - "gptj", - "gptneox", - "mpt", - "baichuan", - "stablelm", - } - - if arch not in supported_archs: - logger.warning("=" * 70) - logger.warning(f"āš ļø Architecture '{arch_name}' may not be supported by llama.cpp") - logger.warning(f"āš ļø The GGUF will be created with architecture: '{arch}'") - logger.warning("āš ļø Check if your inference software supports this architecture.") - logger.warning("=" * 70) - - # Convert using GGUFConverter - tensor_mapper = TensorMapper() - success = GGUFConverter.convert_safetensors( - model_dir, f16_model, model_config, arch, tensor_mapper - ) - - if not success: - logger.error("āŒ Native Python conversion failed") - msg = "Failed to convert SafeTensors model to GGUF" - raise RuntimeError(msg) - - logger.info("āœ… Native Python conversion successful") - return f16_model - - -class HuggingFaceUploader: - """Handles uploading models and documentation to HuggingFace. - - Provides methods for repository creation, file uploads, and README - updates with proper error handling and retry logic. - """ - - @staticmethod - def get_username() -> str: - """Get authenticated HuggingFace username. - - Returns: - HuggingFace username from CLI authentication. - - Raises: - RuntimeError: If not authenticated. - """ - try: - result = subprocess.run( - ["huggingface-cli", "whoami"], - capture_output=True, - text=True, - check=True, - ) - return result.stdout.strip() - except (subprocess.CalledProcessError, FileNotFoundError) as err: - msg = "Please log in to HuggingFace first: huggingface-cli login" - raise RuntimeError(msg) from err - - def upload_readme(self, output_repo: str, readme_path: Path) -> None: - """Upload or update README file to repository. - - Creates repository if needed, handles existing repository updates. - - Raises: - RuntimeError: If the README upload fails. 
- """ - logger.info("Uploading README...") - - # Add delay to prevent rate limiting - time.sleep(2) - - # First ensure the repository exists - self._ensure_repo_exists(output_repo) - - # Upload without --create flag to avoid PR creation - try: - logger.debug(f"DEBUG: Uploading README to {output_repo}") - result = subprocess.run( - [ - "huggingface-cli", - "upload", - output_repo, - str(readme_path), - "README.md", - "--commit-message", - "Update README.md", - ], - check=True, - capture_output=True, - text=True, - ) - logger.debug(f"DEBUG: README upload completed with return code {result.returncode}") - except subprocess.CalledProcessError as e: - logger.error(f"āŒ Failed to upload README to {output_repo}") - logger.error(f"Return code: {e.returncode}") - if e.stderr: - logger.error(f"stderr: {e.stderr}") - if e.stdout: - logger.error(f"stdout: {e.stdout}") - msg = f"README upload failed: {e}" - raise RuntimeError(msg) from e - except Exception as e: - logger.error(f"āŒ Unexpected error during README upload: {e}") - logger.error("Exception traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - raise - logger.info("README uploaded") - - def _ensure_repo_exists(self, repo_id: str) -> None: - """Ensure the repository exists, creating it if necessary.""" - try: - # Try to create the repo - will fail if it already exists - subprocess.run( - [ - "huggingface-cli", - "repo", - "create", - repo_id, - "--type", - "model", - "-y", - ], - check=True, - capture_output=True, - text=True, - ) - logger.info(f"Created repository: {repo_id}") - except subprocess.CalledProcessError: - # Repository already exists, that's fine - pass - - def upload_model_file(self, output_repo: str, model_path: Path) -> None: - """Upload model file to repository. - - Uploads GGUF model file to specified repository path. - Always uses huggingface-cli to ensure proper handling of large files - via HuggingFace's xet backend. - - Raises: - RuntimeError: If the model file upload fails. 
- """ - logger.info(f"Uploading {model_path.name}...") - - # Add delay to prevent rate limiting - time.sleep(3) - - # Always use huggingface-cli for model files to ensure xet backend is used - try: - logger.debug(f"DEBUG: Uploading model file {model_path.name} to {output_repo}") - result = subprocess.run( - [ - "huggingface-cli", - "upload", - output_repo, - str(model_path), - model_path.name, - "--revision", - "main", # Explicitly push to main branch - "--commit-message", - f"Add {model_path.name}", - ], - check=True, - capture_output=True, - text=True, - ) - logger.debug(f"DEBUG: Model upload completed with return code {result.returncode}") - except subprocess.CalledProcessError as e: - logger.error(f"āŒ Failed to upload model file {model_path.name} to {output_repo}") - logger.error(f"Return code: {e.returncode}") - if e.stderr: - logger.error(f"stderr: {e.stderr}") - if e.stdout: - logger.error(f"stdout: {e.stdout}") - msg = f"Model file upload failed: {e}" - raise RuntimeError(msg) from e - except Exception as e: - logger.error(f"āŒ Unexpected error during model file upload: {e}") - logger.error("Exception traceback:") - for line in traceback.format_exc().splitlines(): - logger.error(f" {line}") - raise - - # Extract and log the URL if present in output - if result.stdout: - for line in result.stdout.splitlines(): - if "https://huggingface.co/" in line: - logger.info(f"Upload URL: {line.strip()}") - break - - logger.info(f"{model_path.name} uploaded") - - def _try_git_upload_file( - self, - repo_id: str, - local_path: Path, - repo_path: str, - *, - create_repo: bool = False, - ) -> bool: - """Try to upload file using git directly to avoid PR creation. - - Returns: - bool: True if upload successful, False if should fallback to CLI. - """ - try: - with tempfile.TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) - repo_url = f"https://huggingface.co/{repo_id}" - - # Clone repository - logger.info(f"Cloning {repo_url}...") - result = subprocess.run( - ["git", "clone", repo_url, str(temp_path / "repo")], - check=False, - capture_output=True, - text=True, - ) - - if result.returncode != 0: - if create_repo: - # Repository doesn't exist, let huggingface-cli handle creation - return False - logger.warning(f"Clone failed: {result.stderr}") - return False - - repo_dir = temp_path / "repo" - target_file = repo_dir / repo_path - - # Ensure target directory exists - target_file.parent.mkdir(parents=True, exist_ok=True) - - # Copy file - shutil.copy2(local_path, target_file) - - # Check if there are any changes - status_result = subprocess.run( - ["git", "status", "--porcelain"], - cwd=repo_dir, - capture_output=True, - text=True, - check=True, - ) - - if not status_result.stdout.strip(): - logger.info(f"No changes detected for {repo_path}, file already up-to-date") - return True # File is already up-to-date, no need to push - - # Git add, commit, push - subprocess.run( - ["git", "add", repo_path], - cwd=repo_dir, - check=True, - capture_output=True, - text=True, - ) - subprocess.run( - ["git", "commit", "-m", f"Update {repo_path}"], - cwd=repo_dir, - check=True, - capture_output=True, - text=True, - ) - subprocess.run( - ["git", "push"], - cwd=repo_dir, - check=True, - capture_output=True, - text=True, - ) - - return True - - except subprocess.CalledProcessError as e: - logger.warning(f"Git upload failed: {e}") - return False - except Exception as e: - logger.warning(f"Git upload error: {e}") - return False diff --git a/helpers/utils/config_parser.py b/helpers/utils/config_parser.py 
index 76690e1..46cfe36 100644
--- a/helpers/utils/config_parser.py
+++ b/helpers/utils/config_parser.py
@@ -9,8 +9,8 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
+from helpers.filesystem import FilesystemService
 from helpers.models.conversion import GGUFParameters, ModelConfig, VisionConfig
-from helpers.services.filesystem import FilesystemService
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -119,9 +119,10 @@ class ConfigParser:
         # DO NOT map incompatible architectures
         known_compatible = {
             "LlamaForCausalLM": "llama",
-            "MistralForCausalLM": "llama",  # Mistral IS llama-compatible
+            "MistralForCausalLM": "llama",
             "Qwen2ForCausalLM": "qwen2",
             "GemmaForCausalLM": "gemma",
+            "GptOssForCausalLM": "gptoss",
             "Phi3ForCausalLM": "phi3",
             "FalconForCausalLM": "falcon",
             "GPT2LMHeadModel": "gpt2",
@@ -144,7 +145,13 @@ class ConfigParser:
                 arch_name = arch_name[: -len(suffix)]
                 break
 
-        return arch_name.lower()
+        arch_name = arch_name.lower()
+
+        # Special case: convert "gpt-oss" to "gptoss"
+        if arch_name == "gpt-oss":
+            arch_name = "gptoss"
+
+        return arch_name
 
     @staticmethod
     def load_tokeniser_config(model_path: Path) -> dict[str, Any]:
diff --git a/helpers/utils/rate_limiter.py b/helpers/utils/rate_limiter.py
index 2331cd9..42f952c 100644
--- a/helpers/utils/rate_limiter.py
+++ b/helpers/utils/rate_limiter.py
@@ -26,8 +26,9 @@ class ReadmeRateLimiter:
     def __init__(self, cooldown_seconds: float = 30.0) -> None:
         """Initialise rate limiter with specified cooldown period.
 
-        Args:
-            cooldown_seconds: Minimum seconds between updates (default 30).
+        Sets up the rate limiter with the specified cooldown interval to
+        prevent excessive API calls whilst ensuring pending updates are
+        eventually processed through a timer-based batching mechanism.
         """
         self.cooldown_seconds = cooldown_seconds
         self.last_update_time = 0.0
@@ -47,12 +48,8 @@ class ReadmeRateLimiter:
         """Request a README update, respecting rate limits.
 
         Updates are batched during cooldown periods and executed
-        when the cooldown expires.
-
-        Args:
-            update_func: Function to call for the update
-            *args: Positional arguments for update_func
-            **kwargs: Keyword arguments for update_func
+        when the cooldown expires. Stores the update function and its
+        arguments for deferred execution whilst maintaining thread safety.
         """
         with self.update_lock:
             current_time = time.time()
diff --git a/quantise_gguf.py b/quantise_gguf.py
index 53fde06..5db748a 100644
--- a/quantise_gguf.py
+++ b/quantise_gguf.py
@@ -17,7 +17,7 @@ import sys
 from pathlib import Path
 
 from helpers.logger import logger
-from helpers.services.orchestrator import QuantisationOrchestrator
+from helpers.quantisation import QuantisationOrchestrator
 
 
 def main() -> None:
diff --git a/safetensors2gguf.py b/safetensors2gguf.py
index aac724d..7bce398 100644
--- a/safetensors2gguf.py
+++ b/safetensors2gguf.py
@@ -12,8 +12,8 @@
 import traceback
 from argparse import ArgumentParser
 from pathlib import Path
 
+from helpers.gguf import GGUFConverter
 from helpers.logger import logger
-from helpers.services.gguf import GGUFConverter
 from helpers.utils.config_parser import ConfigParser
 from helpers.utils.tensor_mapping import TensorMapper