# llama-server configuration -- EnvironmentFile for llama-server.service
# After changes:  systemctl restart llama-server

# Base directory for GGUF model files.  Configurable; on some deployments
# this may point to a path under /home or a separate mount.
LLAMA_MODELS_DIR=/var/lib/llama-cpp/models

# Absolute path to the GGUF model to load (MUST be set before starting the
# service).  Write the path out in full: systemd does not expand $VAR or
# ${VAR} references inside an EnvironmentFile, so a value such as
# ${LLAMA_MODELS_DIR}/model.gguf would reach the service as that literal
# string.  When LLAMA_MODELS_DIR is changed, update this path to match.
LLAMA_MODEL=/var/lib/llama-cpp/models/model.gguf

# Bind address and TCP port.  127.0.0.1 is the loopback address, so only
# clients on this host can connect.  Keep it internal and control any outside
# access via nftables or a reverse proxy.
LLAMA_HOST=127.0.0.1
LLAMA_PORT=8080

# Number of CPU threads.  The default of 2 is deliberately conservative and
# MUST be adjusted to the machine!
# Rule of thumb: one thread per physical core (do NOT count SMT/hyperthreads);
# llama.cpp scales better with physical cores than with SMT.  Use llama-bench
# to find the sweet spot.
LLAMA_THREADS=2

# Context size in tokens.  A larger context needs more RAM and makes
# inference slower.
LLAMA_CTX=8192

# Extra arguments passed to llama-server (empty by default), for example:
#   --parallel 4         (multiple concurrent slots)
#   --batch-size 512
#   --flash-attn
#   --numa distribute    (NUMA-aware scheduling on multi-socket / EPYC)
# NOTE(review): whether several arguments are split into separate words
# depends on how the unit's ExecStart= references this variable ($VAR is
# word-split by systemd, ${VAR} is passed as one argument) -- confirm against
# llama-server.service before listing more than one option here.
LLAMA_EXTRA_ARGS=