# llm-gguf-tools/helpers/config/quantisation_configs.py
# 2025-08-09 12:58:58 +01:00
#
# 238 lines, 7.2 KiB, Python

"""Quantisation configuration definitions.
Comprehensive quantisation configurations supporting Q2-Q8 and F32, including
standard profiles and custom Bartowski method variants with tensor-level precision
control. Allows flexible combinations of base quantisation with tensor-specific
overrides for embeddings, attention, and feed-forward layers.
"""
from __future__ import annotations
from helpers.models.quantisation import QuantisationConfig, QuantisationType
# Registry of every quantisation profile the tool understands, keyed by its
# enum member. Each entry records the base ggml type plus any Bartowski-style
# tensor overrides (embedding/output upcasts, inherent K-quant enhancements).
# NOTE: insertion order is preserved deliberately — callers may iterate it.
QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
    # --- Basic low-bit profiles --------------------------------------------
    QuantisationType.Q2_0: QuantisationConfig(
        name="Q2_0",
        description="Basic Q2_0 quantisation (2-bit, smallest)",
        base_type="Q2_0",
        base_precision=2,
    ),
    QuantisationType.Q3_0: QuantisationConfig(
        name="Q3_0",
        description="Basic Q3_0 quantisation (3-bit)",
        base_type="Q3_0",
        base_precision=3,
    ),
    # --- Q2/Q3 K-quants ----------------------------------------------------
    QuantisationType.Q2_K: QuantisationConfig(
        name="Q2_K",
        description="Q2_K quantisation (smallest, lowest quality)",
        base_type="Q2_K",
        base_precision=2,
    ),
    QuantisationType.Q2_K_S: QuantisationConfig(
        name="Q2_K_S",
        description="Q2_K_S quantisation (small variant)",
        base_type="Q2_K_S",
        base_precision=2,
    ),
    QuantisationType.Q3_K_S: QuantisationConfig(
        name="Q3_K_S",
        description="Q3_K_S quantisation (small variant)",
        base_type="Q3_K_S",
        base_precision=3,
    ),
    QuantisationType.Q3_K_M: QuantisationConfig(
        name="Q3_K_M",
        description="Q3_K_M quantisation (medium variant)",
        base_type="Q3_K_M",
        base_precision=3,
        # Tensor-level upcasts that the M variant applies on its own.
        inherent_enhancements={
            "embeddings": "Q6_K",
            "attention_v": "Q5_K",
            "ffn_down": "Q4_K",
        },
    ),
    QuantisationType.Q3_K_L: QuantisationConfig(
        name="Q3_K_L",
        description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output",
        base_type="Q3_K_M",
        base_precision=3,
        output_type="q5_k",
    ),
    QuantisationType.Q3_K_XL: QuantisationConfig(
        name="Q3_K_XL",
        description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output",
        base_type="Q3_K_M",
        base_precision=3,
        embedding_type="q8_0",
        output_type="q6_k",
    ),
    # --- Q4 K-quants -------------------------------------------------------
    QuantisationType.Q4_K_S: QuantisationConfig(
        name="Q4_K_S",
        description="Q4_K_S quantisation (small variant)",
        base_type="Q4_K_S",
        base_precision=4,
    ),
    QuantisationType.Q4_K_M: QuantisationConfig(
        name="Q4_K_M",
        description="Standard Q4_K_M quantisation (baseline)",
        base_type="Q4_K_M",
        base_precision=4,
        inherent_enhancements={
            "embeddings": "Q6_K",
            "attention_v": "Q6_K",
            "ffn_down": "Q6_K",
        },
    ),
    QuantisationType.Q4_K_L: QuantisationConfig(
        name="Q4_K_L",
        description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings",
        base_type="Q4_K_M",
        base_precision=4,
        embedding_type="q8_0",
    ),
    # --- Q5 K-quants -------------------------------------------------------
    QuantisationType.Q5_K_S: QuantisationConfig(
        name="Q5_K_S",
        description="Q5_K_S quantisation (small variant, better than Q4)",
        base_type="Q5_K_S",
        base_precision=5,
    ),
    QuantisationType.Q5_K_M: QuantisationConfig(
        name="Q5_K_M",
        description="Q5_K_M quantisation (medium variant, balanced quality)",
        base_type="Q5_K_M",
        base_precision=5,
        inherent_enhancements={
            "embeddings": "Q6_K",
            "attention_v": "Q6_K",
            "ffn_down": "Q6_K",
        },
    ),
    QuantisationType.Q5_K_L: QuantisationConfig(
        name="Q5_K_L",
        description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings",
        base_type="Q5_K_M",
        base_precision=5,
        embedding_type="q8_0",
    ),
    # --- Q6/Q8 profiles ----------------------------------------------------
    QuantisationType.Q6_0: QuantisationConfig(
        name="Q6_0",
        description="Basic Q6_0 quantisation (6-bit)",
        base_type="Q6_0",
        base_precision=6,
    ),
    QuantisationType.Q6_K: QuantisationConfig(
        name="Q6_K",
        description="Q6_K quantisation (high quality, larger size)",
        base_type="Q6_K",
        base_precision=6,
        inherent_enhancements={
            "embeddings": "Q8_0",
            "attention_v": "Q8_0",
            "ffn_down": "Q6_K",
        },
    ),
    QuantisationType.Q6_K_L: QuantisationConfig(
        name="Q6_K_L",
        description="Bartowski Q6_K_L: Q6_K base with Q8_0 output",
        base_type="Q6_K",
        base_precision=6,
        output_type="q8_0",
    ),
    QuantisationType.Q8_K: QuantisationConfig(
        name="Q8_K",
        description="Q8_K quantisation (highest quality, largest size)",
        base_type="Q8_K",
        base_precision=8,
    ),
    QuantisationType.Q8_0: QuantisationConfig(
        name="Q8_0",
        description="Basic Q8_0 quantisation (8-bit flat)",
        base_type="Q8_0",
        base_precision=8,
    ),
    # --- Legacy (pre-K-quant) formats --------------------------------------
    QuantisationType.Q4_0: QuantisationConfig(
        name="Q4_0",
        description="Legacy Q4_0 quantisation",
        base_type="Q4_0",
        base_precision=4,
    ),
    QuantisationType.Q4_1: QuantisationConfig(
        name="Q4_1",
        description="Legacy Q4_1 quantisation",
        base_type="Q4_1",
        base_precision=4,
    ),
    QuantisationType.Q5_0: QuantisationConfig(
        name="Q5_0",
        description="Legacy Q5_0 quantisation",
        base_type="Q5_0",
        base_precision=5,
    ),
    QuantisationType.Q5_1: QuantisationConfig(
        name="Q5_1",
        description="Legacy Q5_1 quantisation",
        base_type="Q5_1",
        base_precision=5,
    ),
}
# Profiles produced when the user does not ask for anything specific,
# ordered smallest (Q3) to largest (Q8). Every entry must also appear in
# SUPPORTED_QUANTISATION_TYPES and have a config in QUANTISATION_CONFIGS.
DEFAULT_QUANTISATION_TYPES: list[QuantisationType] = [
    QuantisationType.Q3_K_M,   # smallest default tier
    QuantisationType.Q3_K_L,
    QuantisationType.Q3_K_XL,
    QuantisationType.Q4_0,     # basic profile, always available
    QuantisationType.Q4_K_M,
    QuantisationType.Q4_K_L,
    QuantisationType.Q5_0,     # basic profile, always available
    QuantisationType.Q5_K_M,
    QuantisationType.Q5_K_L,
    QuantisationType.Q6_0,     # basic profile, always available
    QuantisationType.Q6_K,
    QuantisationType.Q6_K_L,
    QuantisationType.Q8_0,     # basic profile, always available
    QuantisationType.Q8_K,     # largest default tier
]
# The full menu of profiles the tool accepts, grouped by bit width.
# Each entry has a matching config in QUANTISATION_CONFIGS; this list fixes
# the presentation/processing order, so keep it explicit rather than derived.
SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [
    # 2-bit
    QuantisationType.Q2_0,
    QuantisationType.Q2_K,
    QuantisationType.Q2_K_S,
    # 3-bit
    QuantisationType.Q3_0,
    QuantisationType.Q3_K_S,
    QuantisationType.Q3_K_M,
    QuantisationType.Q3_K_L,
    QuantisationType.Q3_K_XL,
    # 4-bit (legacy Q4_0/Q4_1 plus K-quants)
    QuantisationType.Q4_0,
    QuantisationType.Q4_1,
    QuantisationType.Q4_K_S,
    QuantisationType.Q4_K_M,
    QuantisationType.Q4_K_L,
    # 5-bit (legacy Q5_0/Q5_1 plus K-quants)
    QuantisationType.Q5_0,
    QuantisationType.Q5_1,
    QuantisationType.Q5_K_S,
    QuantisationType.Q5_K_M,
    QuantisationType.Q5_K_L,
    # 6-bit
    QuantisationType.Q6_0,
    QuantisationType.Q6_K,
    QuantisationType.Q6_K_L,
    # 8-bit
    QuantisationType.Q8_0,
    QuantisationType.Q8_K,
]