"""Quantisation configuration definitions.

Pre-defined quantisation configurations for the Bartowski method, supporting
Q4_K_M, Q4_K_L, Q4_K_XL, and Q4_K_XXL variants with tensor-level precision control.
"""
from __future__ import annotations

from helpers.models.quantisation import QuantisationConfig, QuantisationType
# Registry of Bartowski-method quantisation profiles, keyed by variant.
#
# Each QuantisationConfig carries:
#   tensor_types     -- tensor-name patterns (llama.cpp "blk.*" style globs)
#                       mapped to the precision override applied on top of the
#                       Q4_K_M base quantisation.
#   fallback_methods -- alternative override spellings, tried in order when the
#                       primary tensor_types patterns do not match the model:
#                       first the HF-style "embed_tokens.weight" naming, then a
#                       coarse token-embedding-type/output-tensor-type pair
#                       (presumably the llama-quantize CLI flags -- confirm
#                       against the consumer of this config).
QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
    QuantisationType.Q4_K_M: QuantisationConfig(
        name="Q4_K_M",
        description="Standard Q4_K_M quantisation (baseline)",
        tensor_types={},  # No special tensor overrides - uses default Q4_K_M
        fallback_methods=[],
    ),
    QuantisationType.Q4_K_L: QuantisationConfig(
        name="Q4_K_L",
        description="Q6_K embeddings + Q6_K attention (+753MB for vocab + reasoning)",
        tensor_types={
            "token_embd.weight": "Q6_K",
            "output.weight": "Q6_K",
            "lm_head.weight": "Q6_K",
            "blk.*.attn_q.weight": "Q6_K",
            "blk.*.attn_k.weight": "Q6_K",
            "blk.*.attn_v.weight": "Q6_K",
        },
        fallback_methods=[
            # Same overrides under the alternate embedding-tensor name.
            {
                "embed_tokens.weight": "Q6_K",
                "output.weight": "Q6_K",
                "lm_head.weight": "Q6_K",
                "blk.*.attn_q.weight": "Q6_K",
                "blk.*.attn_k.weight": "Q6_K",
                "blk.*.attn_v.weight": "Q6_K",
            },
            # Last resort: blanket embedding/output overrides only.
            {"token-embedding-type": "Q6_K", "output-tensor-type": "Q6_K"},
        ],
    ),
    QuantisationType.Q4_K_XL: QuantisationConfig(
        name="Q4_K_XL",
        description="Q8_0 embeddings + Q6_K attention (+2.1GB for vocabulary + reasoning)",
        tensor_types={
            "token_embd.weight": "Q8_0",
            "output.weight": "Q8_0",
            "lm_head.weight": "Q8_0",
            "blk.*.attn_q.weight": "Q6_K",
            "blk.*.attn_k.weight": "Q6_K",
            "blk.*.attn_v.weight": "Q6_K",
        },
        fallback_methods=[
            # Same overrides under the alternate embedding-tensor name.
            {
                "embed_tokens.weight": "Q8_0",
                "output.weight": "Q8_0",
                "lm_head.weight": "Q8_0",
                "blk.*.attn_q.weight": "Q6_K",
                "blk.*.attn_k.weight": "Q6_K",
                "blk.*.attn_v.weight": "Q6_K",
            },
            # Last resort: blanket embedding/output overrides only.
            {"token-embedding-type": "Q8_0", "output-tensor-type": "Q8_0"},
        ],
    ),
    QuantisationType.Q4_K_XXL: QuantisationConfig(
        name="Q4_K_XXL",
        description="Q8_0 embeddings + Q8_0 attention (+2.8GB total, maximum precision)",
        tensor_types={
            "token_embd.weight": "Q8_0",
            "output.weight": "Q8_0",
            "lm_head.weight": "Q8_0",
            "blk.*.attn_q.weight": "Q8_0",
            "blk.*.attn_k.weight": "Q8_0",
            "blk.*.attn_v.weight": "Q8_0",
        },
        fallback_methods=[
            # Same overrides under the alternate embedding-tensor name.
            {
                "embed_tokens.weight": "Q8_0",
                "output.weight": "Q8_0",
                "lm_head.weight": "Q8_0",
                "blk.*.attn_q.weight": "Q8_0",
                "blk.*.attn_k.weight": "Q8_0",
                "blk.*.attn_v.weight": "Q8_0",
            },
            # Last resort: blanket embedding/output overrides only.
            {"token-embedding-type": "Q8_0", "output-tensor-type": "Q8_0"},
        ],
    ),
}
# Variants this module supports, in ascending precision/size order.
# Kept in sync with the keys of QUANTISATION_CONFIGS above.
SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [
    QuantisationType.Q4_K_M,
    QuantisationType.Q4_K_L,
    QuantisationType.Q4_K_XL,
    QuantisationType.Q4_K_XXL,
]