"""Quantisation configuration definitions. Pre-defined quantisation configurations for the Bartowski method, supporting Q4_K_M, Q4_K_L, Q4_K_XL, and Q4_K_XXL variants with tensor-level precision control. """ from __future__ import annotations from helpers.models.quantisation import QuantisationConfig, QuantisationType QUANTISATION_CONFIGS: dict[QuantisationType, QuantisationConfig] = { QuantisationType.Q4_K_M: QuantisationConfig( name="Q4_K_M", description="Standard Q4_K_M quantisation (baseline)", tensor_types={}, # No special tensor overrides - uses default Q4_K_M fallback_methods=[], ), QuantisationType.Q4_K_L: QuantisationConfig( name="Q4_K_L", description="Q6_K embeddings + Q6_K attention (+753MB for vocab + reasoning)", tensor_types={ "token_embd.weight": "Q6_K", "output.weight": "Q6_K", "lm_head.weight": "Q6_K", "blk.*.attn_q.weight": "Q6_K", "blk.*.attn_k.weight": "Q6_K", "blk.*.attn_v.weight": "Q6_K", }, fallback_methods=[ { "embed_tokens.weight": "Q6_K", "output.weight": "Q6_K", "lm_head.weight": "Q6_K", "blk.*.attn_q.weight": "Q6_K", "blk.*.attn_k.weight": "Q6_K", "blk.*.attn_v.weight": "Q6_K", }, {"token-embedding-type": "Q6_K", "output-tensor-type": "Q6_K"}, ], ), QuantisationType.Q4_K_XL: QuantisationConfig( name="Q4_K_XL", description="Q8_0 embeddings + Q6_K attention (+2.1GB for vocabulary + reasoning)", tensor_types={ "token_embd.weight": "Q8_0", "output.weight": "Q8_0", "lm_head.weight": "Q8_0", "blk.*.attn_q.weight": "Q6_K", "blk.*.attn_k.weight": "Q6_K", "blk.*.attn_v.weight": "Q6_K", }, fallback_methods=[ { "embed_tokens.weight": "Q8_0", "output.weight": "Q8_0", "lm_head.weight": "Q8_0", "blk.*.attn_q.weight": "Q6_K", "blk.*.attn_k.weight": "Q6_K", "blk.*.attn_v.weight": "Q6_K", }, {"token-embedding-type": "Q8_0", "output-tensor-type": "Q8_0"}, ], ), QuantisationType.Q4_K_XXL: QuantisationConfig( name="Q4_K_XXL", description="Q8_0 embeddings + Q8_0 attention (+2.8GB total, maximum precision)", tensor_types={ "token_embd.weight": "Q8_0", "output.weight": "Q8_0", "lm_head.weight": "Q8_0", "blk.*.attn_q.weight": "Q8_0", "blk.*.attn_k.weight": "Q8_0", "blk.*.attn_v.weight": "Q8_0", }, fallback_methods=[ { "embed_tokens.weight": "Q8_0", "output.weight": "Q8_0", "lm_head.weight": "Q8_0", "blk.*.attn_q.weight": "Q8_0", "blk.*.attn_k.weight": "Q8_0", "blk.*.attn_v.weight": "Q8_0", }, {"token-embedding-type": "Q8_0", "output-tensor-type": "Q8_0"}, ], ), } SUPPORTED_QUANTISATION_TYPES: list[QuantisationType] = [ QuantisationType.Q4_K_M, QuantisationType.Q4_K_L, QuantisationType.Q4_K_XL, QuantisationType.Q4_K_XXL, ]