llm-gguf-tools/helpers/config/quantisation_configs.py

"""Quantisation configuration definitions.
Comprehensive quantisation configurations supporting Q2-Q8 and F32, including
standard profiles and custom Bartowski method variants with tensor-level precision
control. Allows flexible combinations of base quantisation with tensor-specific
overrides for embeddings, attention, and feed-forward layers.
"""
from __future__ import annotations
from helpers.models.quantisation import QuantisationConfig, QuantisationType
QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
# Standard quantisation profiles
QuantisationType.Q2_K: QuantisationConfig(
name="Q2_K",
description="Q2_K quantisation (smallest, lowest quality)",
base_precision=2,
base_type="Q2_K",
),
QuantisationType.Q2_K_S: QuantisationConfig(
name="Q2_K_S",
description="Q2_K_S quantisation (small variant)",
base_precision=2,
base_type="Q2_K_S",
),
QuantisationType.Q3_K_S: QuantisationConfig(
name="Q3_K_S",
description="Q3_K_S quantisation (small variant)",
base_precision=3,
base_type="Q3_K_S",
),
QuantisationType.Q3_K_M: QuantisationConfig(
name="Q3_K_M",
description="Q3_K_M quantisation (medium variant)",
base_precision=3,
base_type="Q3_K_M",
inherent_enhancements={
"embeddings": "Q6_K",
"attention_v": "Q5_K",
"ffn_down": "Q4_K",
},
),
QuantisationType.Q3_K_L: QuantisationConfig(
name="Q3_K_L",
description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output",
base_type="Q3_K_M",
base_precision=3,
output_type="Q5_K",
),
QuantisationType.Q3_K_XL: QuantisationConfig(
name="Q3_K_XL",
description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output",
base_type="Q3_K_M",
base_precision=3,
embedding_type="Q8_0",
output_type="Q6_K",
),
QuantisationType.Q4_K_S: QuantisationConfig(
name="Q4_K_S",
description="Q4_K_S quantisation (small variant)",
base_precision=4,
base_type="Q4_K_S",
),
QuantisationType.Q4_K_M: QuantisationConfig(
name="Q4_K_M",
description="Standard Q4_K_M quantisation (baseline)",
base_precision=4,
base_type="Q4_K_M",
inherent_enhancements={
"embeddings": "Q6_K",
"attention_v": "Q6_K",
"ffn_down": "Q6_K",
},
),
QuantisationType.Q4_K_L: QuantisationConfig(
name="Q4_K_L",
description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings",
base_type="Q4_K_M",
base_precision=4,
embedding_type="Q8_0",
),
# Additional standard quantisation profiles
QuantisationType.Q5_K_S: QuantisationConfig(
name="Q5_K_S",
        description="Q5_K_S quantisation (small variant, higher quality than Q4_K_S)",
base_precision=5,
base_type="Q5_K_S",
),
QuantisationType.Q5_K_M: QuantisationConfig(
name="Q5_K_M",
description="Q5_K_M quantisation (medium variant, balanced quality)",
base_precision=5,
base_type="Q5_K_M",
inherent_enhancements={
"embeddings": "Q6_K",
"attention_v": "Q6_K",
"ffn_down": "Q6_K",
},
),
QuantisationType.Q5_K_L: QuantisationConfig(
name="Q5_K_L",
description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings",
base_type="Q5_K_M",
base_precision=5,
embedding_type="Q8_0",
),
QuantisationType.Q6_K: QuantisationConfig(
name="Q6_K",
description="Q6_K quantisation (high quality, larger size)",
base_precision=6,
base_type="Q6_K",
inherent_enhancements={
"embeddings": "Q8_0",
"attention_v": "Q8_0",
"ffn_down": "Q6_K",
},
),
QuantisationType.Q6_K_L: QuantisationConfig(
name="Q6_K_L",
description="Bartowski Q6_K_L: Q6_K base with Q8_0 output",
base_type="Q6_K",
base_precision=6,
output_type="Q8_0",
),
QuantisationType.Q8_0: QuantisationConfig(
name="Q8_0",
description="Q8_0 quantisation (highest quality, largest size)",
base_precision=8,
base_type="Q8_0",
),
# Legacy formats
QuantisationType.Q4_0: QuantisationConfig(
name="Q4_0",
description="Legacy Q4_0 quantisation",
base_precision=4,
base_type="Q4_0",
),
QuantisationType.Q4_1: QuantisationConfig(
name="Q4_1",
description="Legacy Q4_1 quantisation",
base_precision=4,
base_type="Q4_1",
),
QuantisationType.Q5_0: QuantisationConfig(
name="Q5_0",
description="Legacy Q5_0 quantisation",
base_precision=5,
base_type="Q5_0",
),
QuantisationType.Q5_1: QuantisationConfig(
name="Q5_1",
description="Legacy Q5_1 quantisation",
base_precision=5,
base_type="Q5_1",
),
}
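

# Illustrative sketch only, not part of the public API: shows how a caller
# might resolve one of the profiles above into per-tensor quantisation types.
# It assumes QuantisationConfig exposes the keyword arguments used above as
# attributes (base_type, inherent_enhancements, embedding_type, output_type),
# with the optional ones defaulting to None; adjust to the real model
# definition in helpers.models.quantisation if it differs.
def _example_tensor_plan(config: QuantisationConfig) -> dict[str, str]:
    """Map tensor groups to the quantisation type an exporter might apply."""
    plan: dict[str, str] = {"default": config.base_type}
    # Inherent enhancements ship with the base profile (e.g. Q4_K_M already
    # upgrades embeddings, attention_v, and ffn_down)...
    plan.update(getattr(config, "inherent_enhancements", None) or {})
    # ...while explicit Bartowski-style overrides take precedence over them.
    if getattr(config, "embedding_type", None):
        plan["embeddings"] = config.embedding_type
    if getattr(config, "output_type", None):
        plan["output"] = config.output_type
    return plan
    # e.g. _example_tensor_plan(QUANTISATION_CONFIGS[QuantisationType.Q4_K_M])
    # -> {"default": "Q4_K_M", "embeddings": "Q6_K",
    #     "attention_v": "Q6_K", "ffn_down": "Q6_K"}

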
# Default profile set for optimal quality/size balance
DEFAULT_QUANTISATION_TYPES: list[QuantisationType] = [
QuantisationType.Q3_K_M,
QuantisationType.Q3_K_L,
QuantisationType.Q3_K_XL,
QuantisationType.Q4_K_M,
QuantisationType.Q4_K_L,
QuantisationType.Q5_K_M,
QuantisationType.Q5_K_L,
QuantisationType.Q6_K,
QuantisationType.Q6_K_L,
QuantisationType.Q8_0,
]

SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [
# Q2 variants
QuantisationType.Q2_K,
QuantisationType.Q2_K_S,
# Q3 K-quants
QuantisationType.Q3_K_S,
QuantisationType.Q3_K_M,
QuantisationType.Q3_K_L,
QuantisationType.Q3_K_XL,
# Q4 K-quants
QuantisationType.Q4_K_S,
QuantisationType.Q4_K_M,
QuantisationType.Q4_K_L,
# Q5 K-quants
QuantisationType.Q5_K_S,
QuantisationType.Q5_K_M,
QuantisationType.Q5_K_L,
# Q6_K
QuantisationType.Q6_K,
QuantisationType.Q6_K_L,
# Q8_0
QuantisationType.Q8_0,
# Legacy formats
QuantisationType.Q4_0,
QuantisationType.Q4_1,
QuantisationType.Q5_0,
QuantisationType.Q5_1,
]
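

if __name__ == "__main__":
    # Quick self-check when run directly; illustrative only. Assumes
    # QuantisationConfig exposes ``name`` and ``description`` attributes,
    # matching the keyword arguments used in QUANTISATION_CONFIGS above.
    for qtype in DEFAULT_QUANTISATION_TYPES:
        assert qtype in SUPPORTED_QUANTISATION_TYPES, f"{qtype} not in supported list"
        config = QUANTISATION_CONFIGS[qtype]
        print(f"{config.name}: {config.description}")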