"""Quantisation configuration definitions.

Comprehensive quantisation configurations supporting Q2-Q8 and F32, including
standard profiles and custom Bartowski method variants with tensor-level precision
control. Allows flexible combinations of base quantisation with tensor-specific
overrides for embeddings, attention, and feed-forward layers.
"""

from __future__ import annotations

from helpers.models.quantisation import QuantisationConfig, QuantisationType

QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = {
|
|
# Standard quantisation profiles
|
|
QuantisationType.Q2_K: QuantisationConfig(
|
|
name="Q2_K",
|
|
description="Q2_K quantisation (smallest, lowest quality)",
|
|
base_precision=2,
|
|
base_type="Q2_K",
|
|
),
|
|
QuantisationType.Q2_K_S: QuantisationConfig(
|
|
name="Q2_K_S",
|
|
description="Q2_K_S quantisation (small variant)",
|
|
base_precision=2,
|
|
base_type="Q2_K_S",
|
|
),
|
|
QuantisationType.Q3_K_S: QuantisationConfig(
|
|
name="Q3_K_S",
|
|
description="Q3_K_S quantisation (small variant)",
|
|
base_precision=3,
|
|
base_type="Q3_K_S",
|
|
),
|
|
QuantisationType.Q3_K_M: QuantisationConfig(
|
|
name="Q3_K_M",
|
|
description="Q3_K_M quantisation (medium variant)",
|
|
base_precision=3,
|
|
base_type="Q3_K_M",
|
|
inherent_enhancements={
|
|
"embeddings": "Q6_K",
|
|
"attention_v": "Q5_K",
|
|
"ffn_down": "Q4_K",
|
|
},
|
|
),
|
|
QuantisationType.Q3_K_L: QuantisationConfig(
|
|
name="Q3_K_L",
|
|
description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output",
|
|
base_type="Q3_K_M",
|
|
base_precision=3,
|
|
output_type="Q5_K",
|
|
),
|
|
QuantisationType.Q3_K_XL: QuantisationConfig(
|
|
name="Q3_K_XL",
|
|
description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output",
|
|
base_type="Q3_K_M",
|
|
base_precision=3,
|
|
embedding_type="Q8_0",
|
|
output_type="Q6_K",
|
|
),
|
|
QuantisationType.Q4_K_S: QuantisationConfig(
|
|
name="Q4_K_S",
|
|
description="Q4_K_S quantisation (small variant)",
|
|
base_precision=4,
|
|
base_type="Q4_K_S",
|
|
),
|
|
QuantisationType.Q4_K_M: QuantisationConfig(
|
|
name="Q4_K_M",
|
|
description="Standard Q4_K_M quantisation (baseline)",
|
|
base_precision=4,
|
|
base_type="Q4_K_M",
|
|
inherent_enhancements={
|
|
"embeddings": "Q6_K",
|
|
"attention_v": "Q6_K",
|
|
"ffn_down": "Q6_K",
|
|
},
|
|
),
|
|
QuantisationType.Q4_K_L: QuantisationConfig(
|
|
name="Q4_K_L",
|
|
description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings",
|
|
base_type="Q4_K_M",
|
|
base_precision=4,
|
|
embedding_type="Q8_0",
|
|
),
|
|
# Additional standard quantisation profiles
|
|
QuantisationType.Q5_K_S: QuantisationConfig(
|
|
name="Q5_K_S",
|
|
description="Q5_K_S quantisation (small variant, better than Q4)",
|
|
base_precision=5,
|
|
base_type="Q5_K_S",
|
|
),
|
|
QuantisationType.Q5_K_M: QuantisationConfig(
|
|
name="Q5_K_M",
|
|
description="Q5_K_M quantisation (medium variant, balanced quality)",
|
|
base_precision=5,
|
|
base_type="Q5_K_M",
|
|
inherent_enhancements={
|
|
"embeddings": "Q6_K",
|
|
"attention_v": "Q6_K",
|
|
"ffn_down": "Q6_K",
|
|
},
|
|
),
|
|
QuantisationType.Q5_K_L: QuantisationConfig(
|
|
name="Q5_K_L",
|
|
description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings",
|
|
base_type="Q5_K_M",
|
|
base_precision=5,
|
|
embedding_type="Q8_0",
|
|
),
|
|
QuantisationType.Q6_K: QuantisationConfig(
|
|
name="Q6_K",
|
|
description="Q6_K quantisation (high quality, larger size)",
|
|
base_precision=6,
|
|
base_type="Q6_K",
|
|
inherent_enhancements={
|
|
"embeddings": "Q8_0",
|
|
"attention_v": "Q8_0",
|
|
"ffn_down": "Q6_K",
|
|
},
|
|
),
|
|
QuantisationType.Q6_K_L: QuantisationConfig(
|
|
name="Q6_K_L",
|
|
description="Bartowski Q6_K_L: Q6_K base with Q8_0 output",
|
|
base_type="Q6_K",
|
|
base_precision=6,
|
|
output_type="Q8_0",
|
|
),
|
|
QuantisationType.Q8_0: QuantisationConfig(
|
|
name="Q8_0",
|
|
description="Q8_0 quantisation (highest quality, largest size)",
|
|
base_precision=8,
|
|
base_type="Q8_0",
|
|
),
|
|
# Legacy formats
|
|
QuantisationType.Q4_0: QuantisationConfig(
|
|
name="Q4_0",
|
|
description="Legacy Q4_0 quantisation",
|
|
base_precision=4,
|
|
base_type="Q4_0",
|
|
),
|
|
QuantisationType.Q4_1: QuantisationConfig(
|
|
name="Q4_1",
|
|
description="Legacy Q4_1 quantisation",
|
|
base_precision=4,
|
|
base_type="Q4_1",
|
|
),
|
|
QuantisationType.Q5_0: QuantisationConfig(
|
|
name="Q5_0",
|
|
description="Legacy Q5_0 quantisation",
|
|
base_precision=5,
|
|
base_type="Q5_0",
|
|
),
|
|
QuantisationType.Q5_1: QuantisationConfig(
|
|
name="Q5_1",
|
|
description="Legacy Q5_1 quantisation",
|
|
base_precision=5,
|
|
base_type="Q5_1",
|
|
),
|
|
}


# Default profile set for optimal quality/size balance
|
|
DEFAULT_QUANTISATION_TYPES: list[QuantisationType] = [
|
|
QuantisationType.Q3_K_M,
|
|
QuantisationType.Q3_K_L,
|
|
QuantisationType.Q3_K_XL,
|
|
QuantisationType.Q4_K_M,
|
|
QuantisationType.Q4_K_L,
|
|
QuantisationType.Q5_K_M,
|
|
QuantisationType.Q5_K_L,
|
|
QuantisationType.Q6_K,
|
|
QuantisationType.Q6_K_L,
|
|
QuantisationType.Q8_0,
|
|
]


SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [
|
|
# Q2 variants
|
|
QuantisationType.Q2_K,
|
|
QuantisationType.Q2_K_S,
|
|
# Q3 K-quants
|
|
QuantisationType.Q3_K_S,
|
|
QuantisationType.Q3_K_M,
|
|
QuantisationType.Q3_K_L,
|
|
QuantisationType.Q3_K_XL,
|
|
# Q4 K-quants
|
|
QuantisationType.Q4_K_S,
|
|
QuantisationType.Q4_K_M,
|
|
QuantisationType.Q4_K_L,
|
|
# Q5 K-quants
|
|
QuantisationType.Q5_K_S,
|
|
QuantisationType.Q5_K_M,
|
|
QuantisationType.Q5_K_L,
|
|
# Q6_K
|
|
QuantisationType.Q6_K,
|
|
QuantisationType.Q6_K_L,
|
|
# Q8_0
|
|
QuantisationType.Q8_0,
|
|
# Legacy formats
|
|
QuantisationType.Q4_0,
|
|
QuantisationType.Q4_1,
|
|
QuantisationType.Q5_0,
|
|
QuantisationType.Q5_1,
|
|
]