# llama-server configuration -- EnvironmentFile for llama-server.service
# After changes: systemctl restart llama-server
#
# Format: one KEY=value assignment per line; lines starting with '#' are
# comments.

# Base directory for GGUF model files. Configurable; on some deployments
# this may point to a path under /home or a separate mount.
LLAMA_MODELS_DIR=/var/lib/llama-cpp/models

# Path to the GGUF model to load (MUST be set before starting the service).
# Written out in full on purpose: systemd does not expand variable
# references inside an EnvironmentFile, so "${LLAMA_MODELS_DIR}/model.gguf"
# would be handed to the service as that literal string. If you change
# LLAMA_MODELS_DIR above, update this path as well.
LLAMA_MODEL=/var/lib/llama-cpp/models/model.gguf

# Bind address -- keep internal, control access via nftables or reverse proxy.
LLAMA_HOST=127.0.0.1
LLAMA_PORT=8080

# CPU threads -- conservative default of 2. MUST be adjusted to the machine!
# Rule of thumb: number of physical cores (NOT SMT/hyperthreads). llama.cpp
# scales better with physical cores than with SMT. Use llama-bench to find
# the sweet spot.
LLAMA_THREADS=2

# Context size in tokens. Larger = more RAM and slower inference.
LLAMA_CTX=8192

# Extra arguments passed to llama-server, for example:
#   --parallel 4        (multiple concurrent slots)
#   --batch-size 512
#   --flash-attn
#   --numa distribute   (NUMA-aware scheduling on multi-socket / EPYC)
LLAMA_EXTRA_ARGS=