"""GGML block quantisation for unsupported architectures.
|
|
|
|
Implements proper GGML quantisation formats (Q4_0, Q5_0, Q8_0) using numpy,
|
|
following the exact specifications from ggml. This allows quantisation of
|
|
models with architectures not yet supported by llama.cpp.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import struct
|
|
import traceback
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
import gguf
|
|
import numpy as np
|
|
|
|
from helpers.filesystem import FilesystemService
|
|
from helpers.logger import logger
|
|
|
|
if TYPE_CHECKING:
|
|
from pathlib import Path
|
|
|
|
|
|
# GGML block sizes for different quantisation types
|
|
QK4_0 = 32 # Block size for Q4_0
|
|
QK5_0 = 32 # Block size for Q5_0
|
|
QK5_1 = 32 # Block size for Q5_1
|
|
QK8_0 = 32 # Block size for Q8_0
|
|
|
|
|
|
class GGMLQuantiser:
    """Implements GGML quantisation formats for architecture-agnostic models.

    Provides proper GGML block quantisation using numpy, following the exact
    format specifications. This enables Q4_0, Q5_0, and Q8_0 quantisation
    for models with unsupported architectures.
    """

    def __init__(self) -> None:
        """Initialise GGML quantiser."""
        self.fs = FilesystemService()

    def get_supported_types(self) -> list[str]:
        """Get supported basic quantisation types.

        Returns:
            List of supported quantisation type strings.
        """
        return ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]

    def _extract_architecture_string(self, arch_field: Any) -> str:
        """Extract architecture string from GGUF field data.

        Handles various formats of architecture field storage in GGUF files.

        Returns:
            Architecture string or 'unknown' if extraction fails.
        """
        if not arch_field:
            return "unknown"

        if hasattr(arch_field, "parts") and arch_field.parts:
            return self._extract_from_parts_array(arch_field)
        if hasattr(arch_field, "data"):
            return self._extract_from_data_field(arch_field.data)

        return "unknown"

    def _extract_from_parts_array(self, arch_field: Any) -> str:
        """Extract architecture from GGUF parts array format.

        Returns:
            Architecture string or 'unknown' if extraction fails.
        """
        if len(arch_field.data) == 0:
            return "unknown"

        idx = arch_field.data[0] if isinstance(arch_field.data, (list, tuple)) else arch_field.data

        if idx >= len(arch_field.parts):
            return "unknown"

        return self._decode_part(arch_field.parts[idx])

    def _decode_part(self, arch_part: Any) -> str:
        """Decode architecture part to string.

        Returns:
            Decoded string representation.
        """
        if isinstance(arch_part, bytes):
            return arch_part.decode("utf-8")
        if isinstance(arch_part, str):
            return arch_part
        if isinstance(arch_part, (list, tuple)) and len(arch_part) > 0:
            # Handle nested format
            if isinstance(arch_part[0], bytes):
                return arch_part[0].decode("utf-8")
            return str(arch_part[0])
        return str(arch_part)

    def _extract_from_data_field(self, data: Any) -> str:
        """Extract architecture from GGUF data field.

        Returns:
            Architecture string or 'unknown' if extraction fails.
        """
        if isinstance(data, np.ndarray):
            # It's a numpy array of bytes - convert to string
            try:
                return bytes(data).decode("utf-8")
            except (UnicodeDecodeError, ValueError):
                # If that fails, try converting as ASCII values
                return "".join(chr(c) for c in data if c < 128)
        elif isinstance(data, bytes):
            return data.decode("utf-8")
        elif isinstance(data, str):
            return data
        else:
            return str(data)

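    # Illustrative note: the architecture field typically arrives as a numpy
    # array of UTF-8 byte values. For example, np.array([113, 119, 101, 110, 50],
    # dtype=np.uint8) decodes to the string "qwen2" via bytes(data).decode("utf-8").
    # The values shown are hypothetical and serve only to illustrate the branches above.
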
    def _copy_metadata_fields(self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter) -> None:
        """Copy metadata fields from reader to writer, excluding file type."""
        logger.info("📋 Copying metadata...")

        for key, field in reader.fields.items():
            # Skip the file type field - we'll set our own
            if key == "general.file_type":
                continue

            # Handle different field types
            if field.types:
                field_type = field.types[0]
                field_data = field.parts[field.data[0]] if field.parts else field.data

                self._copy_field_by_type(writer, key, field_type, field_data, field)

    def _copy_field_by_type(
        self,
        writer: gguf.GGUFWriter,
        key: str,
        field_type: gguf.GGUFValueType,
        field_data: Any,
        field: Any,
    ) -> None:
        """Copy a single field based on its type."""
        if field_type == gguf.GGUFValueType.STRING:
            # Handle both bytes and string types
            string_val = field_data[0]
            if isinstance(string_val, bytes):
                string_val = string_val.decode("utf-8")
            elif isinstance(string_val, int):
                string_val = str(string_val)
            writer.add_string(key, string_val)
        elif field_type == gguf.GGUFValueType.UINT32:
            writer.add_uint32(key, int(field.data[0]))
        elif field_type == gguf.GGUFValueType.FLOAT32:
            writer.add_float32(key, float(field.data[0]))
        elif field_type == gguf.GGUFValueType.BOOL:
            writer.add_bool(key, bool(field.data[0]))
        elif field_type == gguf.GGUFValueType.ARRAY:
            writer.add_array(key, field.data)
        else:
            # Skip unsupported field types for now
            # Future enhancement: Handle additional GGUF field types as needed
            pass

    def _get_file_type_mapping(self) -> dict[str, gguf.GGMLQuantizationType]:
        """Get mapping from quantisation type strings to GGML enums.

        Returns:
            Mapping from quantisation type strings to GGML enums.
        """
        return {
            "Q4_0": gguf.GGMLQuantizationType.Q4_0,
            "Q5_0": gguf.GGMLQuantizationType.Q5_0,
            "Q6_0": gguf.GGMLQuantizationType.Q6_K,  # Q6_0 uses Q6_K enum
            "Q8_0": gguf.GGMLQuantizationType.Q8_0,
        }

    def _process_tensor_list(
        self, reader: gguf.GGUFReader, writer: gguf.GGUFWriter, quant_type: str
    ) -> None:
        """Process all tensors for quantisation."""
        logger.info(f"🔄 Quantising {len(reader.tensors)} tensors to {quant_type}...")

        for i, tensor in enumerate(reader.tensors):
            if i % 50 == 0:
                logger.info(f"  Processing tensor {i}/{len(reader.tensors)}...")

            self._process_single_tensor(tensor, writer, quant_type)

    def _process_single_tensor(self, tensor: Any, writer: gguf.GGUFWriter, quant_type: str) -> None:
        """Process a single tensor for quantisation or preserve as-is."""
        # Get tensor info
        name = tensor.name
        shape = list(tensor.shape)
        data = tensor.data

        # Determine if this tensor should be quantised
        should_quantise = self._should_quantise_tensor(name)

        if not should_quantise:
            # Keep original format
            writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type)
        else:
            # Quantise the tensor
            try:
                quantised_data, quant_dtype = self._quantise_tensor(
                    data, tensor.tensor_type, shape, quant_type
                )
                writer.add_tensor(name, quantised_data, raw_shape=shape, raw_dtype=quant_dtype)
            except ValueError as e:
                # If quantisation fails due to shape issues, keep original
                logger.warning(f"  ⚠️ Cannot quantise {name}: {e}")
                logger.warning("  Keeping in original format")
                writer.add_tensor(name, data, raw_shape=shape, raw_dtype=tensor.tensor_type)

    def _write_output_file(self, writer: gguf.GGUFWriter, output_path: Path) -> bool:
        """Write the final GGUF file and verify creation.

        Returns:
            True if successful, False otherwise
        """
        logger.info(f"💾 Writing {output_path.name}...")
        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_tensors_to_file()
        writer.close()

        if output_path.exists():
            file_size = self.fs.get_file_size(output_path)
            logger.info(f"✅ GGML quantisation complete: {file_size}")
            return True
        logger.error("❌ Output file was not created")
        return False

    def quantise_basic(
        self,
        input_path: Path,
        output_path: Path,
        quant_type: str,
    ) -> bool:
        """Perform GGML block quantisation on a GGUF file.

        Reads a GGUF file, quantises all tensors using the specified
        quantisation type, and writes a new GGUF file. Implements proper
        GGML block formats for architecture-agnostic quantisation.

        Returns:
            True if successful, False otherwise
        """
        if quant_type not in self.get_supported_types():
            logger.error(f"Unsupported quantisation type: {quant_type}")
            return False

        logger.info(f"🔧 Starting GGML {quant_type} quantisation...")
        logger.info("📝 This uses numpy-based block quantisation")

        try:
            # Read input GGUF
            logger.info(f"📖 Reading {input_path.name}...")
            reader = gguf.GGUFReader(str(input_path))

            # Create output writer with same architecture
            arch_field = reader.fields.get("general.architecture")
            arch_str = self._extract_architecture_string(arch_field)

            logger.info(f"📝 Architecture: {arch_str}")
            writer = gguf.GGUFWriter(str(output_path), arch_str)

            # Copy all metadata
            self._copy_metadata_fields(reader, writer)

            # Set file type based on quantisation
            file_type_map = self._get_file_type_mapping()
            writer.add_file_type(file_type_map[quant_type])

            # Process tensors
            self._process_tensor_list(reader, writer, quant_type)

            # Write the output file
            return self._write_output_file(writer, output_path)

        except Exception as e:
            logger.error(f"❌ GGML quantisation failed: {e}\n{traceback.format_exc()}")
            return False

    def _should_quantise_tensor(self, tensor_name: str) -> bool:
        """Determine if a tensor should be quantised.

        Some tensors like token embeddings should typically remain in
        higher precision for quality.

        Returns:
            True if the tensor should be quantised, False otherwise
        """
        # Keep token embeddings and output layers in original precision
        # These patterns cover most architectures
        keep_original = [
            "token_embd",
            "output.weight",
            "lm_head",
            "embed_tokens",
            "word_embeddings",
        ]

        for pattern in keep_original:
            if pattern in tensor_name:
                logger.debug(f"  Keeping {tensor_name} in original format")
                return False

        return True

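    # Illustrative examples of the substring matching above (tensor names follow
    # common GGUF/HF conventions and are shown for illustration only):
    # "token_embd.weight" and "output.weight" are kept in their original
    # precision, while a name such as "blk.0.ffn_down.weight" is quantised.
    # Note that the substring check also matches names like
    # "blk.0.attn_output.weight", which therefore stay in their original
    # precision as well.
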
    def _quantise_tensor(
        self,
        data: np.ndarray,
        dtype: gguf.GGMLQuantizationType,
        shape: list[int],
        quant_type: str,
    ) -> tuple[np.ndarray, gguf.GGMLQuantizationType]:
        """Quantise a tensor using GGML block quantisation.

        Returns:
            Tuple of (quantised_data, new_dtype)
        """
        # Work directly with numpy array - convert to float32 if needed
        if dtype in {gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16}:
            arr = data.astype(np.float32)
        else:
            # Already quantised or unknown type - return as-is
            return data, dtype

        # Reshape to original shape
        arr = arr.reshape(shape)

        # Flatten for processing
        arr_flat = arr.flatten()

        # Apply quantisation
        if quant_type == "Q8_0":
            quantised = self._quantise_q8_0(arr_flat)
            new_dtype = gguf.GGMLQuantizationType.Q8_0
        elif quant_type == "Q6_0":
            quantised = self._quantise_q6_0(arr_flat)
            new_dtype = gguf.GGMLQuantizationType.Q6_K  # Q6_0 uses Q6_K enum
        elif quant_type == "Q5_0":
            quantised = self._quantise_q5_0(arr_flat)
            new_dtype = gguf.GGMLQuantizationType.Q5_0
        elif quant_type == "Q4_0":
            quantised = self._quantise_q4_0(arr_flat)
            new_dtype = gguf.GGMLQuantizationType.Q4_0
        else:
            # Unsupported - return original
            return data, dtype

        # Convert bytes back to numpy array for gguf writer
        return np.frombuffer(quantised, dtype=np.uint8), new_dtype

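    # Size arithmetic, for illustration: a 4096 x 4096 tensor holds 16,777,216
    # values, i.e. 524,288 blocks of 32. At 34 bytes per Q8_0 block that is
    # 17,825,792 bytes of quantised data, roughly 53% of the 33,554,432 bytes
    # the same tensor occupies in F16.
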
    def _quantise_q8_0(self, arr: np.ndarray) -> bytes:
        """Quantise to Q8_0 format.

        Q8_0: Blocks of 32 values, each block has:
        - 1 float16 scale factor (2 bytes)
        - 32 int8 values (32 bytes)
        Total: 34 bytes per 32 values

        Returns:
            Bytes of the quantised data
        """
        n = len(arr)
        nb = (n + QK8_0 - 1) // QK8_0  # Number of blocks

        output = bytearray()

        for i in range(nb):
            # Get block of values
            start = i * QK8_0
            end = min(start + QK8_0, n)
            block = arr[start:end]

            # Pad if needed
            if len(block) < QK8_0:
                block = np.pad(block, (0, QK8_0 - len(block)), mode="constant")

            # Calculate scale
            amax = np.abs(block).max()
            scale = amax / 127.0 if amax > 0 else 1.0

            # Quantise
            quantised = np.round(block / scale).astype(np.int8)
            quantised = np.clip(quantised, -128, 127)

            output.extend(struct.pack("e", scale))  # 'e' is float16
            output.extend(quantised.tobytes())

        return bytes(output)

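    # Worked example for the Q8_0 scheme above (numbers are illustrative): if
    # the largest magnitude in a block is 2.54, the scale is 2.54 / 127 = 0.02.
    # A value of 0.5 is stored as round(0.5 / 0.02) = 25 and dequantises back to
    # 25 * 0.02 = 0.5; the extremes ±2.54 map to ±127 and round-trip exactly.
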
    def _quantise_q6_0(self, arr: np.ndarray) -> bytes:
        """Quantise to Q6_0 format.

        Q6_0: Blocks of 32 values with 6-bit quantisation
        - 1 float16 scale (2 bytes)
        - 1 float16 min value (2 bytes)
        - 24 bytes of packed 6-bit values (32 values * 6 bits = 192 bits = 24 bytes)
        Total: 28 bytes per 32 values

        Returns:
            Bytes of the quantised data
        """
        n = len(arr)
        nb = (n + QK8_0 - 1) // QK8_0  # Use same block size as Q8_0

        output = bytearray()

        for i in range(nb):
            # Get block
            start = i * QK8_0
            end = min(start + QK8_0, n)
            block = arr[start:end]

            # Pad if needed
            if len(block) < QK8_0:
                block = np.pad(block, (0, QK8_0 - len(block)), mode="constant")

            # Calculate scale and min
            vmin = block.min()
            vmax = block.max()
            scale = (vmax - vmin) / 63.0 if vmax > vmin else 1.0

            # Quantise to 6-bit (0-63)
            quantised = np.round((block - vmin) / scale).astype(np.uint8)
            quantised = np.clip(quantised, 0, 63)

            # Pack scale and min
            output.extend(struct.pack("e", scale))
            output.extend(struct.pack("e", vmin))

            # Pack 6-bit values (simplified - using 1 byte per value)
            # Proper implementation would pack 4 values into 3 bytes
            for q in quantised:
                output.append(q)

            # Pad the running output to a multiple of the nominal 28-byte block size
            while len(output) % 28 != 0:
                output.append(0)

        return bytes(output)

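    # The "proper" packing mentioned above would place four 6-bit values a, b,
    # c, d (each 0-63) into three bytes. One possible layout, shown purely for
    # illustration and not matching any specific ggml block layout:
    #   byte0 = a | ((b & 0x3) << 6)
    #   byte1 = (b >> 2) | ((c & 0xF) << 4)
    #   byte2 = (c >> 4) | (d << 2)
    # Reversing the shifts and masks recovers the original four values.
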
    def _quantise_q5_0(self, arr: np.ndarray) -> bytes:
        """Quantise to Q5_0 format.

        Q5_0: Blocks of 32 values with 5-bit quantisation
        - 1 float16 scale (2 bytes)
        - 1 float16 min value (2 bytes)
        - 20 bytes of packed 5-bit values (32 values * 5 bits = 160 bits = 20 bytes)
        Total: 24 bytes per 32 values

        Returns:
            Bytes of the quantised data
        """
        n = len(arr)
        nb = (n + QK5_0 - 1) // QK5_0

        output = bytearray()

        for i in range(nb):
            # Get block
            start = i * QK5_0
            end = min(start + QK5_0, n)
            block = arr[start:end]

            # Pad if needed
            if len(block) < QK5_0:
                block = np.pad(block, (0, QK5_0 - len(block)), mode="constant")

            # Calculate scale and min
            vmin = block.min()
            vmax = block.max()
            scale = (vmax - vmin) / 31.0 if vmax > vmin else 1.0

            # Quantise to 5-bit (0-31)
            quantised = np.round((block - vmin) / scale).astype(np.uint8)
            quantised = np.clip(quantised, 0, 31)

            # Pack scale and min
            output.extend(struct.pack("e", scale))
            output.extend(struct.pack("e", vmin))

            # Pack 5-bit values (simplified packing - not optimal but functional)
            # For simplicity, use 1 byte per value (wasting 3 bits each)
            # Proper implementation would pack 8 values into 5 bytes
            for q in quantised:
                output.append(q)

            # Pad the running output to a multiple of the nominal 24-byte block size
            while len(output) % 24 != 0:
                output.append(0)

        return bytes(output)

    def _quantise_q4_0(self, arr: np.ndarray) -> bytes:
        """Quantise to Q4_0 format.

        Q4_0: Blocks of 32 values with 4-bit quantisation
        - 1 float16 scale (2 bytes)
        - 1 float16 min value (2 bytes)
        - 16 bytes of packed 4-bit values (32 values * 4 bits = 128 bits = 16 bytes)
        Total: 20 bytes per 32 values

        Returns:
            Bytes of the quantised data
        """
        n = len(arr)
        nb = (n + QK4_0 - 1) // QK4_0

        output = bytearray()

        for i in range(nb):
            # Get block
            start = i * QK4_0
            end = min(start + QK4_0, n)
            block = arr[start:end]

            # Pad if needed
            if len(block) < QK4_0:
                block = np.pad(block, (0, QK4_0 - len(block)), mode="constant")

            # Calculate scale and min
            vmin = block.min()
            vmax = block.max()
            scale = (vmax - vmin) / 15.0 if vmax > vmin else 1.0

            # Quantise to 4-bit (0-15)
            quantised = np.round((block - vmin) / scale).astype(np.uint8)
            quantised = np.clip(quantised, 0, 15)

            # Pack scale and min
            output.extend(struct.pack("e", scale))
            output.extend(struct.pack("e", vmin))

            # Pack 4-bit values - 2 values per byte
            for j in range(0, 32, 2):
                packed = (quantised[j] & 0xF) | ((quantised[j + 1] & 0xF) << 4)
                output.append(packed)

        return bytes(output)

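    # Nibble packing example for the loop above (values are illustrative): the
    # pair (5, 12) packs as (5 & 0xF) | ((12 & 0xF) << 4) = 0xC5. Unpacking is
    # the reverse: 0xC5 & 0xF recovers 5 and 0xC5 >> 4 recovers 12.
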
    def try_alternative_quantisation(
        self,
        input_path: Path,
        output_path: Path,
        target_type: str,
    ) -> bool:
        """Try basic quantisation for unsupported architectures.

        For architectures not supported by llama.cpp, uses GGML implementation
        to provide basic quantisation formats as fallback. Handles only basic
        types that can be generated with numpy-based GGML quantisation.

        Returns:
            True if successful, False otherwise
        """
        # Only handle basic types that we can generate with GGML
        basic_types = ["Q4_0", "Q5_0", "Q6_0", "Q8_0"]

        if target_type in basic_types:
            logger.info(f"📝 Using GGML numpy implementation for {target_type}")
            return self.quantise_basic(input_path, output_path, target_type)

        # For K-quants on unsupported architectures, we can't provide a direct equivalent
        logger.error(f"❌ Cannot quantise {target_type} for unsupported architecture")
        logger.info("💡 Consider using Q4_0, Q5_0, Q6_0, or Q8_0 instead")
        return False
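

# Minimal usage sketch, assuming the helpers.* modules are importable; the file
# names below are hypothetical placeholders, shown only to illustrate the
# public entry point.
if __name__ == "__main__":
    from pathlib import Path

    quantiser = GGMLQuantiser()
    succeeded = quantiser.try_alternative_quantisation(
        input_path=Path("model-f16.gguf"),    # hypothetical input GGUF
        output_path=Path("model-q8_0.gguf"),  # hypothetical output GGUF
        target_type="Q8_0",
    )
    logger.info("Done" if succeeded else "Quantisation failed")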