"""GGUF file writing operations.
|
|
|
|
Provides high-level interface for creating GGUF files with metadata,
|
|
tensors, and tokeniser information.
|
|
"""

from __future__ import annotations

import json
import operator
import traceback
from pathlib import Path
from typing import TYPE_CHECKING, Any, Protocol

import gguf

from helpers.logger import logger

if TYPE_CHECKING:
    import numpy as np

    from helpers.models.conversion import ModelConfig


class VisionConfig(Protocol):
    """Protocol for vision model configuration."""

    hidden_size: int
    num_hidden_layers: int
    num_attention_heads: int
    intermediate_size: int
    patch_size: int
    spatial_merge_size: int

class GGUFWriter:
    """Manages GGUF file creation and metadata writing.

    Provides a high-level interface for GGUF file operations, including
    metadata configuration, tensor addition, and tokeniser integration.
    Encapsulates low-level GGUF library interactions for consistent error
    handling.
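
    Example (illustrative sketch; ``config`` and ``embeddings`` are assumed
    to come from the surrounding conversion pipeline):
        writer = GGUFWriter(Path("model.gguf"), "llama")
        writer.add_metadata(config, "my-model")
        writer.add_tensor("token_embd.weight", embeddings)
        writer.write()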
    """

    def __init__(self, output_path: Path, architecture: str) -> None:
        """Initialise GGUF writer with output path and architecture.

        Creates the underlying GGUF writer instance and prepares for metadata
        and tensor addition. Sets up the file structure for the specified
        model architecture.
        """
        self.output_path = output_path
        self.architecture = architecture
        self.writer = gguf.GGUFWriter(str(output_path), architecture)
        logger.info(f"Created GGUF writer for {architecture} architecture")

    def add_metadata(self, model_config: ModelConfig, model_name: str) -> None:
        """Add comprehensive metadata from model configuration.

        Writes general model information, architectural parameters, and
        quantisation settings to the GGUF file header. Handles both standard
        and vision model configurations with appropriate parameter mapping.
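
        Example (illustrative; ``config`` is a loaded ModelConfig):
            writer.add_metadata(config, "my-model-7b")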
        """
        # General metadata
        self.writer.add_name(model_name)
        self.writer.add_description(f"Converted from {model_config.architectures[0]}")
        self.writer.add_file_type(gguf.LlamaFileType.ALL_F32)

        # Log architecture being used
        logger.info(f"Setting GGUF architecture: {self.architecture}")
        if self.architecture not in {"llama", "qwen2", "gemma", "phi3", "falcon", "gpt2"}:
            logger.warning(f"Architecture '{self.architecture}' may not be supported by llama.cpp")

        # Model parameters from config
        params = model_config.to_gguf_params()
        self.writer.add_context_length(params.context_length)
        self.writer.add_embedding_length(params.embedding_length)
        self.writer.add_block_count(params.block_count)
        self.writer.add_feed_forward_length(params.feed_forward_length)
        self.writer.add_head_count(params.attention_head_count)
        self.writer.add_head_count_kv(params.attention_head_count_kv)
        self.writer.add_layer_norm_rms_eps(params.attention_layer_norm_rms_epsilon)
        self.writer.add_rope_freq_base(params.rope_freq_base)
        self.writer.add_rope_dimension_count(params.rope_dimension_count)

        logger.info(f"Added metadata: {params.block_count} layers, {params.context_length} context")

    def add_vision_metadata(self, vision_config: VisionConfig | None) -> None:
        """Add vision model parameters to GGUF metadata.

        Configures vision-specific parameters for multimodal models including
        embedding dimensions, attention heads, and spatial processing settings.
        """
        if not vision_config:
            return

        logger.info("Adding vision model parameters...")
        self.writer.add_vision_embedding_length(vision_config.hidden_size)
        self.writer.add_vision_block_count(vision_config.num_hidden_layers)
        self.writer.add_vision_head_count(vision_config.num_attention_heads)
        self.writer.add_vision_feed_forward_length(vision_config.intermediate_size)
        self.writer.add_vision_patch_size(vision_config.patch_size)
        self.writer.add_vision_spatial_merge_size(vision_config.spatial_merge_size)

        if hasattr(vision_config, "rms_norm_eps") and vision_config.rms_norm_eps:
            self.writer.add_vision_attention_layernorm_eps(vision_config.rms_norm_eps)

    def add_tokeniser(self, tokeniser_config: dict[str, Any]) -> None:
        """Add tokeniser metadata to GGUF file.

        Writes special token IDs and tokeniser model type to enable proper
        text processing during inference. Uses sensible defaults for missing
        configuration values.
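
        Example (illustrative config, as typically read from tokenizer_config.json):
            writer.add_tokeniser({"bos_token_id": 1, "eos_token_id": 2, "add_bos_token": True})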
        """
        self.writer.add_bos_token_id(tokeniser_config.get("bos_token_id", 1))
        self.writer.add_eos_token_id(tokeniser_config.get("eos_token_id", 2))
        self.writer.add_unk_token_id(tokeniser_config.get("unk_token_id", 0))
        self.writer.add_pad_token_id(tokeniser_config.get("pad_token_id", 0))

        # Add BOS/EOS token addition flags if available
        if "add_bos_token" in tokeniser_config:
            self.writer.add_add_bos_token(tokeniser_config["add_bos_token"])
        if "add_eos_token" in tokeniser_config:
            self.writer.add_add_eos_token(tokeniser_config["add_eos_token"])

        # Note: tokenizer_model is set by add_tokeniser_vocabulary based on the actual tokeniser type

        logger.info("Added tokeniser configuration")

    def add_tokeniser_vocabulary(self, model_path: Path) -> None:
        """Add full tokeniser vocabulary to GGUF file.

        Loads and embeds the complete tokeniser vocabulary including tokens,
        merges, and scores to enable standalone model usage without external
        tokeniser files. Supports BPE, Unigram, and WordPiece tokenisers.
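
        The relevant ``tokenizer.json`` layout (abridged) is:
            {
              "model": {"type": "BPE", "vocab": {...}, "merges": [...]},
              "pre_tokenizer": {"type": "ByteLevel", ...},
              "added_tokens": [{"id": 0, "content": "<unk>", ...}]
            }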
        """
        tokenizer_path = model_path / "tokenizer.json"
        if not tokenizer_path.exists():
            logger.warning("tokenizer.json not found, skipping vocabulary embedding")
            return

        try:
            with tokenizer_path.open(encoding="utf-8") as f:
                tokenizer_data = json.load(f)

            model_data = tokenizer_data.get("model", {})
            model_type = model_data.get("type", "")

            # Get pre-tokenizer information
            pre_tokenizer = tokenizer_data.get("pre_tokenizer", {})
            pre_tokenizer_type = self._get_pre_tokenizer_type(pre_tokenizer)

            # Get added tokens
            added_tokens = tokenizer_data.get("added_tokens", [])

            if model_type == "BPE":
                self._add_bpe_tokenizer(model_data, added_tokens, pre_tokenizer_type)
            elif model_type == "Unigram":
                self._add_unigram_tokenizer(model_data, added_tokens)
            elif model_type == "WordPiece":
                self._add_wordpiece_tokenizer(model_data, added_tokens)
            else:
                logger.warning(f"Unsupported tokenizer type: {model_type}")
                # Try to add as a generic tokeniser
                self._add_generic_tokenizer(model_data, tokenizer_data)

        except Exception as e:
            logger.error(f"Failed to load tokeniser vocabulary: {e}")
            logger.error(traceback.format_exc())

    def _get_pre_tokenizer_type(self, pre_tokenizer: dict[str, Any]) -> str:
        """Determine pre-tokenizer type from configuration.

        Returns:
            Pre-tokenizer identifier: "llama3" for ByteLevel pre-tokenisers,
            otherwise "default".
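
        Example (illustrative):
            {"type": "ByteLevel"}  -> "llama3"
            {"type": "Metaspace"}  -> "default"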
        """
        if not pre_tokenizer:
            return "default"

        # Check for various pre-tokenizer types. ByteLevel may appear at the
        # top level or nested inside a Sequence of pre-tokenizers (as in the
        # Llama 3 tokenizer.json), so check both locations.
        pre_type = pre_tokenizer.get("type", "")
        if "ByteLevel" in str(pre_type):
            return "llama3"
        if pre_type == "Sequence":
            nested = pre_tokenizer.get("pretokenizers", [])
            if any("ByteLevel" in str(p.get("type", "")) for p in nested if isinstance(p, dict)):
                return "llama3"
        if "Metaspace" in str(pre_type):
            return "default"

        return "default"

    def _add_bpe_tokenizer(
        self,
        model_data: dict[str, Any],
        added_tokens: list[dict[str, Any]],
        pre_tokenizer_type: str,
    ) -> None:
        """Add BPE tokenizer to GGUF file."""
        vocab = model_data.get("vocab", {})
        merges = model_data.get("merges", [])
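        # BPE vocab maps token string -> token id; merges are space-separated
        # pair strings in priority order (values here are illustrative only),
        # e.g. vocab {"Ġthe": 279}, merges ["Ġ t", "th e"].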

        # Set tokenizer model based on pre-tokenizer type
        if pre_tokenizer_type == "llama3":
            self.writer.add_tokenizer_model("gpt2")
            self.writer.add_tokenizer_pre("llama3")
        else:
            self.writer.add_tokenizer_model("gpt2")

        # Precompute added-token contents for O(1) membership checks
        added_contents = {t.get("content") for t in added_tokens}

        # Create token list with scores
        tokens = []
        scores = []
        toktypes = []

        # Add vocabulary tokens in token-id order
        for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)):
            tokens.append(token_str)
            scores.append(0.0)  # BPE doesn't use scores

            # Determine token type
            if token_str in added_contents:
                toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                toktypes.append(gguf.TokenType.NORMAL)

        # Add to writer
        self.writer.add_token_list(tokens)
        self.writer.add_token_scores(scores)
        self.writer.add_token_types(toktypes)

        # Add merges
        if merges:
            self.writer.add_token_merges(merges)

        logger.info(f"Added BPE tokenizer: {len(tokens)} tokens, {len(merges)} merges")

    def _add_unigram_tokenizer(
        self,
        model_data: dict[str, Any],
        added_tokens: list[dict[str, Any]],
    ) -> None:
        """Add Unigram tokenizer to GGUF file."""
        vocab = model_data.get("vocab", [])
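        # Unigram vocab entries are [token, log-probability] pairs, e.g.
        # [["<unk>", 0.0], ["▁the", -3.67]] (illustrative values)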

        self.writer.add_tokenizer_model("unigram")

        # Precompute added-token contents for O(1) membership checks
        added_contents = {t.get("content") for t in added_tokens}

        # Create token list with scores
        tokens = []
        scores = []
        toktypes = []

        # Add vocabulary tokens
        for token_data in vocab:
            if isinstance(token_data, list) and len(token_data) >= 2:
                token_str, score = token_data[0], token_data[1]
            else:
                continue

            tokens.append(token_str)
            scores.append(float(score))

            # Determine token type
            if token_str in added_contents:
                toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                toktypes.append(gguf.TokenType.NORMAL)

        # Add to writer
        self.writer.add_token_list(tokens)
        self.writer.add_token_scores(scores)
        self.writer.add_token_types(toktypes)

        logger.info(f"Added Unigram tokenizer: {len(tokens)} tokens")

    def _add_wordpiece_tokenizer(
        self,
        model_data: dict[str, Any],
        added_tokens: list[dict[str, Any]],
    ) -> None:
        """Add WordPiece tokenizer to GGUF file."""
        vocab = model_data.get("vocab", {})

        self.writer.add_tokenizer_model("bert")

        # Precompute added-token contents for O(1) membership checks
        added_contents = {t.get("content") for t in added_tokens}

        # Create token list
        tokens = []
        scores = []
        toktypes = []

        # Add vocabulary tokens in token-id order
        for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)):
            tokens.append(token_str)
            scores.append(0.0)  # WordPiece doesn't use scores

            # Determine token type
            if token_str in added_contents:
                toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                toktypes.append(gguf.TokenType.NORMAL)

        # Add to writer
        self.writer.add_token_list(tokens)
        self.writer.add_token_scores(scores)
        self.writer.add_token_types(toktypes)

        logger.info(f"Added WordPiece tokenizer: {len(tokens)} tokens")

    def _add_generic_tokenizer(
        self,
        model_data: dict[str, Any],
        tokenizer_data: dict[str, Any],
    ) -> None:
        """Add generic tokenizer as fallback."""
        logger.warning("Using generic tokenizer fallback")

        # Try to extract vocabulary from various possible locations
        vocab = model_data.get("vocab", tokenizer_data.get("vocab", {}))

        if not vocab:
            logger.error("No vocabulary found in tokenizer")
            return

        self.writer.add_tokenizer_model("gpt2")  # Default to GPT-2 style

        # Create basic token list
        tokens = []
        scores = []
        toktypes = []

        if isinstance(vocab, dict):
            # Dict-style vocab: token string -> token id
            for token_str, _token_id in sorted(vocab.items(), key=operator.itemgetter(1)):
                tokens.append(token_str)
                scores.append(0.0)
                toktypes.append(gguf.TokenType.NORMAL)
        elif isinstance(vocab, list):
            # List-style vocab: plain strings or [token, score] pairs
            for item in vocab:
                if isinstance(item, str):
                    tokens.append(item)
                    scores.append(0.0)
                    toktypes.append(gguf.TokenType.NORMAL)
                elif isinstance(item, list) and len(item) >= 1:
                    tokens.append(str(item[0]))
                    scores.append(float(item[1]) if len(item) > 1 else 0.0)
                    toktypes.append(gguf.TokenType.NORMAL)

        if tokens:
            self.writer.add_token_list(tokens)
            self.writer.add_token_scores(scores)
            self.writer.add_token_types(toktypes)
            logger.info(f"Added generic tokenizer: {len(tokens)} tokens")
        else:
            logger.error("Failed to extract tokens from vocabulary")

    def add_tensor(self, name: str, data: np.ndarray) -> None:
        """Add tensor to GGUF file.

        Accepts a tensor name following GGUF naming conventions and its
        corresponding numpy array data. The tensor is stored for writing
        when the file is finalised.
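
        Example (illustrative name, shape, and dtype):
            writer.add_tensor("token_embd.weight", np.zeros((32000, 4096), dtype=np.float32))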
        """
        self.writer.add_tensor(name, data)

    def write(self) -> None:
        """Finalise and write GGUF file to disk.

        Writes the header, key-value metadata, and tensors to the output
        file, completing the GGUF creation process.
        """
        logger.info(f"Writing GGUF file to {self.output_path}...")
        self.writer.write_header_to_file()
        self.writer.write_kv_data_to_file()
        self.writer.write_tensors_to_file()
        self.writer.close()
        logger.info("✅ GGUF file written successfully")