# llm-gguf-tools/helpers/config/quantisation_configs.py
# 2025-08-07 18:29:12 +01:00
# 95 lines · 3.3 KiB · Python

"""Quantisation configuration definitions.
Pre-defined quantisation configurations for the Bartowski method, supporting
Q4_K_M, Q4_K_L, Q4_K_XL, and Q4_K_XXL variants with tensor-level precision control.
"""
from __future__ import annotations
from helpers.models.quantisation import QuantisationConfig, QuantisationType
def _layer_overrides(embed_type: str, attn_type: str, *, embed_key: str) -> dict[str, str]:
    """Build a tensor-name -> quantisation-type override map.

    Args:
        embed_type: Quantisation applied to the embedding and output tensors.
        attn_type: Quantisation applied to per-block attention projections.
        embed_key: Name used for the token-embedding tensor —
            "token_embd.weight" for the primary (GGUF-style) naming,
            "embed_tokens.weight" for the fallback naming.

    Returns:
        Mapping of tensor-name patterns to quantisation type strings.
    """
    overrides = {
        embed_key: embed_type,
        "output.weight": embed_type,
        "lm_head.weight": embed_type,
    }
    # Attention Q/K/V projection weights for every transformer block.
    overrides.update(
        {f"blk.*.attn_{proj}.weight": attn_type for proj in ("q", "k", "v")}
    )
    return overrides


def _enhanced_config(
    name: str, description: str, embed_type: str, attn_type: str
) -> QuantisationConfig:
    """Create a Q4_K variant that upgrades embedding and attention precision.

    fallback_methods are tried in order when the primary tensor names do not
    match: first the alternate tensor naming, then the coarse per-category
    CLI flags ("token-embedding-type" / "output-tensor-type").
    """
    return QuantisationConfig(
        name=name,
        description=description,
        tensor_types=_layer_overrides(embed_type, attn_type, embed_key="token_embd.weight"),
        fallback_methods=[
            _layer_overrides(embed_type, attn_type, embed_key="embed_tokens.weight"),
            {"token-embedding-type": embed_type, "output-tensor-type": embed_type},
        ],
    )


# Registry of pre-defined Bartowski-method quantisation profiles, keyed by
# variant. All enhanced variants share the same override structure and differ
# only in the precision chosen for embeddings vs attention tensors.
QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
    QuantisationType.Q4_K_M: QuantisationConfig(
        name="Q4_K_M",
        description="Standard Q4_K_M quantisation (baseline)",
        tensor_types={},  # No special tensor overrides - uses default Q4_K_M
        fallback_methods=[],
    ),
    QuantisationType.Q4_K_L: _enhanced_config(
        "Q4_K_L",
        "Q6_K embeddings + Q6_K attention (+753MB for vocab + reasoning)",
        "Q6_K",
        "Q6_K",
    ),
    QuantisationType.Q4_K_XL: _enhanced_config(
        "Q4_K_XL",
        "Q8_0 embeddings + Q6_K attention (+2.1GB for vocabulary + reasoning)",
        "Q8_0",
        "Q6_K",
    ),
    QuantisationType.Q4_K_XXL: _enhanced_config(
        "Q4_K_XXL",
        "Q8_0 embeddings + Q8_0 attention (+2.8GB total, maximum precision)",
        "Q8_0",
        "Q8_0",
    ),
}
# Variants exposed to callers, ordered from baseline to maximum precision.
SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [
    getattr(QuantisationType, variant)
    for variant in ("Q4_K_M", "Q4_K_L", "Q4_K_XL", "Q4_K_XXL")
]