"""Quantisation configuration definitions. Comprehensive quantisation configurations supporting Q2-Q8 and F32, including standard profiles and custom Bartowski method variants with tensor-level precision control. Allows flexible combinations of base quantisation with tensor-specific overrides for embeddings, attention, and feed-forward layers. """ from __future__ import annotations from helpers.models.quantisation import QuantisationConfig, QuantisationType QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = { # Standard quantisation profiles QuantisationType.Q2_K: QuantisationConfig( name="Q2_K", description="Q2_K quantisation (smallest, lowest quality)", base_precision=2, base_type="Q2_K", ), QuantisationType.Q2_K_S: QuantisationConfig( name="Q2_K_S", description="Q2_K_S quantisation (small variant)", base_precision=2, base_type="Q2_K_S", ), QuantisationType.Q3_K_S: QuantisationConfig( name="Q3_K_S", description="Q3_K_S quantisation (small variant)", base_precision=3, base_type="Q3_K_S", ), QuantisationType.Q3_K_M: QuantisationConfig( name="Q3_K_M", description="Q3_K_M quantisation (medium variant)", base_precision=3, base_type="Q3_K_M", inherent_enhancements={ "embeddings": "Q6_K", "attention_v": "Q5_K", "ffn_down": "Q4_K", }, ), QuantisationType.Q3_K_L: QuantisationConfig( name="Q3_K_L", description="Bartowski Q3_K_L: Q3_K_M base with Q5_K output", base_type="Q3_K_M", base_precision=3, output_type="Q5_K", ), QuantisationType.Q3_K_XL: QuantisationConfig( name="Q3_K_XL", description="Bartowski Q3_K_XL: Q3_K_M base with Q8_0 embeddings + Q6_K output", base_type="Q3_K_M", base_precision=3, embedding_type="Q8_0", output_type="Q6_K", ), QuantisationType.Q4_K_S: QuantisationConfig( name="Q4_K_S", description="Q4_K_S quantisation (small variant)", base_precision=4, base_type="Q4_K_S", ), QuantisationType.Q4_K_M: QuantisationConfig( name="Q4_K_M", description="Standard Q4_K_M quantisation (baseline)", base_precision=4, base_type="Q4_K_M", inherent_enhancements={ "embeddings": "Q6_K", "attention_v": "Q6_K", "ffn_down": "Q6_K", }, ), QuantisationType.Q4_K_L: QuantisationConfig( name="Q4_K_L", description="Bartowski Q4_K_L: Q4_K_M base with Q8_0 embeddings", base_type="Q4_K_M", base_precision=4, embedding_type="Q8_0", ), # Additional standard quantisation profiles QuantisationType.Q5_K_S: QuantisationConfig( name="Q5_K_S", description="Q5_K_S quantisation (small variant, better than Q4)", base_precision=5, base_type="Q5_K_S", ), QuantisationType.Q5_K_M: QuantisationConfig( name="Q5_K_M", description="Q5_K_M quantisation (medium variant, balanced quality)", base_precision=5, base_type="Q5_K_M", inherent_enhancements={ "embeddings": "Q6_K", "attention_v": "Q6_K", "ffn_down": "Q6_K", }, ), QuantisationType.Q5_K_L: QuantisationConfig( name="Q5_K_L", description="Bartowski Q5_K_L: Q5_K_M base with Q8_0 embeddings", base_type="Q5_K_M", base_precision=5, embedding_type="Q8_0", ), QuantisationType.Q6_K: QuantisationConfig( name="Q6_K", description="Q6_K quantisation (high quality, larger size)", base_precision=6, base_type="Q6_K", inherent_enhancements={ "embeddings": "Q8_0", "attention_v": "Q8_0", "ffn_down": "Q6_K", }, ), QuantisationType.Q6_K_L: QuantisationConfig( name="Q6_K_L", description="Bartowski Q6_K_L: Q6_K base with Q8_0 output", base_type="Q6_K", base_precision=6, output_type="Q8_0", ), QuantisationType.Q8_0: QuantisationConfig( name="Q8_0", description="Q8_0 quantisation (highest quality, largest size)", base_precision=8, base_type="Q8_0", ), # Legacy formats QuantisationType.Q4_0: QuantisationConfig( name="Q4_0", description="Legacy Q4_0 quantisation", base_precision=4, base_type="Q4_0", ), QuantisationType.Q4_1: QuantisationConfig( name="Q4_1", description="Legacy Q4_1 quantisation", base_precision=4, base_type="Q4_1", ), QuantisationType.Q5_0: QuantisationConfig( name="Q5_0", description="Legacy Q5_0 quantisation", base_precision=5, base_type="Q5_0", ), QuantisationType.Q5_1: QuantisationConfig( name="Q5_1", description="Legacy Q5_1 quantisation", base_precision=5, base_type="Q5_1", ), } # Default profile set for optimal quality/size balance DEFAULT_QUANTISATION_TYPES: list[QuantisationType] = [ QuantisationType.Q3_K_M, QuantisationType.Q3_K_L, QuantisationType.Q3_K_XL, QuantisationType.Q4_K_M, QuantisationType.Q4_K_L, QuantisationType.Q5_K_M, QuantisationType.Q5_K_L, QuantisationType.Q6_K, QuantisationType.Q6_K_L, QuantisationType.Q8_0, ] SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [ # Q2 variants QuantisationType.Q2_K, QuantisationType.Q2_K_S, # Q3 K-quants QuantisationType.Q3_K_S, QuantisationType.Q3_K_M, QuantisationType.Q3_K_L, QuantisationType.Q3_K_XL, # Q4 K-quants QuantisationType.Q4_K_S, QuantisationType.Q4_K_M, QuantisationType.Q4_K_L, # Q5 K-quants QuantisationType.Q5_K_S, QuantisationType.Q5_K_M, QuantisationType.Q5_K_L, # Q6_K QuantisationType.Q6_K, QuantisationType.Q6_K_L, # Q8_0 QuantisationType.Q8_0, # Legacy formats QuantisationType.Q4_0, QuantisationType.Q4_1, QuantisationType.Q5_0, QuantisationType.Q5_1, ]