gpu-llm-benchmarking/.env.example
Tom Foster 86e9de9e75
All checks were successful
Build Vast.ai Ollama Benchmark Image / Build and Push (push) Successful in 4m43s
Initial commit
2025-07-28 16:58:21 +01:00

26 lines
1.2 KiB
Text

# Example environment file — copy to `.env` and fill in real values before use.
# This file is a committed template: never place a live API key in it.
# Vast.ai API configuration
VAST_API_KEY="YOUR_API_KEY" # Your Vast.ai API key for instance management
# GPU instance specifications
GPU_TYPE="RTX 5090" # GPU type to request (e.g. RTX 4090, RTX 5090, A100)
NUM_GPUS=1 # Number of GPUs to allocate for the instance
DISK_SPACE=30 # Disk space in GB for the instance
REGION="" # Region to filter for instances (e.g. "PL", "RO", or leave empty for any)
# Benchmark configuration
TEST_ITERATIONS=3 # Number of test runs per context length
CONTEXT_START=8000 # Starting context length for benchmark
CONTEXT_END=128000 # Maximum context length for benchmark
CONTEXT_MULTIPLIER=2 # Multiplier for context length progression
# NOTE(review): with these defaults the sweep is presumably
# 8000 -> 16000 -> 32000 -> 64000 -> 128000 — confirm against the benchmark script.
# Ollama configuration
OLLAMA_MODEL="hf.co/unsloth/gemma-3-12b-it-GGUF:Q5_K_XL" # Model to benchmark
OLLAMA_FLASH_ATTENTION="1" # Enable flash attention for performance
OLLAMA_KEEP_ALIVE="10m" # How long to keep model loaded in memory
OLLAMA_KV_CACHE_TYPE="q8_0" # KV cache quantisation type
OLLAMA_MAX_LOADED_MODELS="1" # Maximum number of loaded models
OLLAMA_NUM_GPU="1" # Number of GPUs to use for Ollama
OLLAMA_NUM_PARALLEL="1" # Number of parallel requests
# Local file transfer settings
LOCAL_RESULTS_DIR="./results" # Local directory to store benchmark results