# Copyright 2024-2026 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

# Must be set before inheriting rocm.eclass.
ROCM_VERSION=7.1

inherit cmake cuda rocm linux-info systemd

# Pinned revision of the tiny test model fetched for USE=examples.
TINY_LLAMAS_COMMIT="99dd1a73db5a37100bd4ae633f4cfce6560e1567"

DESCRIPTION="LLM inference in C/C++ (GGML/GGUF) — CPU + optional GPU backends"
HOMEPAGE="https://github.com/ggml-org/llama.cpp"
SRC_URI="https://github.com/ggml-org/llama.cpp/archive/refs/tags/b${PV}.tar.gz -> ${P}.gh.tar.gz"
SRC_URI+="
	examples? (
		https://huggingface.co/ggml-org/tiny-llamas/resolve/${TINY_LLAMAS_COMMIT}/stories15M-q4_0.gguf
			-> ggml-org_models_tinyllamas_stories15M-q4_0-${TINY_LLAMAS_COMMIT}.gguf
	)
"
S="${WORKDIR}/llama.cpp-b${PV}"

LICENSE="MIT"
SLOT="0"
KEYWORDS="~amd64"

# wmma: rocWMMA flash-attention on RDNA3+/CDNA GPUs
# see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#hip
IUSE="
	curl openblas +openmp blis rocm cuda opencl openssl vulkan flexiblas wmma examples
	cpu_flags_x86_avx
	cpu_flags_x86_avx2
	cpu_flags_x86_fma3
	cpu_flags_x86_f16c
	cpu_flags_x86_bmi2
	cpu_flags_x86_avx_vnni
	cpu_flags_x86_avx512f
	cpu_flags_x86_avx512vbmi
	cpu_flags_x86_avx512_vnni
	cpu_flags_x86_avx512_bf16
"

# - At most one BLAS vendor may be selected.
# - rocm needs at least one amdgpu_targets_* flag (ROCM_REQUIRED_USE comes
#   from rocm.eclass); otherwise AMDGPU_TARGETS would be passed empty.
# - The cpu_flags_x86_* chain mirrors the ISA extension hierarchy.
REQUIRED_USE="
	?? ( openblas blis flexiblas )
	rocm? ( ${ROCM_REQUIRED_USE} )
	wmma? ( rocm )
	cpu_flags_x86_avx2? ( cpu_flags_x86_avx )
	cpu_flags_x86_avx512f? ( cpu_flags_x86_avx2 )
	cpu_flags_x86_avx512vbmi? ( cpu_flags_x86_avx512f )
	cpu_flags_x86_avx512_vnni? ( cpu_flags_x86_avx512f )
	cpu_flags_x86_avx512_bf16? ( cpu_flags_x86_avx512f )
"

# curl: needed for pulling models from huggingface
# numpy: used by convert_hf_to_gguf.py
CDEPEND="
	curl? ( net-misc/curl:= )
	openblas? ( sci-libs/openblas:= )
	openmp? ( llvm-runtimes/openmp:= )
	blis? ( sci-libs/blis:= )
	flexiblas? ( sci-libs/flexiblas:= )
	rocm? (
		>=dev-util/hip-${ROCM_VERSION}:=
		>=sci-libs/hipBLAS-${ROCM_VERSION}:=
		wmma? ( >=sci-libs/rocWMMA-${ROCM_VERSION}:= )
	)
	cuda? ( dev-util/nvidia-cuda-toolkit:= )
	openssl? ( dev-libs/openssl:= )
"
DEPEND="${CDEPEND}
	opencl? ( dev-util/opencl-headers )
	vulkan? (
		dev-util/spirv-headers
		dev-util/vulkan-headers
	)
"
RDEPEND="${CDEPEND}
	dev-python/numpy
	opencl? ( dev-libs/opencl-icd-loader )
	vulkan? ( media-libs/vulkan-loader )
	acct-user/llama-cpp
	acct-group/llama-cpp
"
# The shader compiler is only needed to build the Vulkan backend.
BDEPEND="
	vulkan? ( media-libs/shaderc )
"

# Warn (non-fatally) when building with ROCm against a kernel whose
# configuration is readable but lacks HSA_AMD_SVM.
pkg_setup() {
	if use rocm; then
		linux-info_pkg_setup
		if linux-info_get_any_version && linux_config_exists; then
			if ! linux_chkconfig_present HSA_AMD_SVM; then
				ewarn "ROCm/HIP requires HSA_AMD_SVM enabled in your kernel config."
			fi
		fi
	fi
}

# Run the eclass prepare steps and, for USE=examples, stage the downloaded
# tiny model inside the build directory under its upstream file name.
src_prepare() {
	use cuda && cuda_src_prepare
	cmake_src_prepare

	if use examples; then
		local model_dir="${BUILD_DIR}/tinyllamas"

		mkdir -p "${model_dir}" || die
		cp "${DISTDIR}/ggml-org_models_tinyllamas_stories15M-q4_0-${TINY_LLAMAS_COMMIT}.gguf" \
			"${model_dir}/stories15M-q4_0.gguf" || die
	fi
}

# Translate USE flags into CMake options and configure the build.
src_configure() {
	# Private libdir shared by the install location and the runtime search path.
	local libdir="${EPREFIX}/usr/$(get_libdir)/llama.cpp"

	local mycmakeargs=(
		# -- Build options --
		-DLLAMA_BUILD_TESTS=OFF
		-DLLAMA_BUILD_EXAMPLES=$(usex examples)
		-DLLAMA_BUILD_SERVER=ON
		-DCMAKE_SKIP_BUILD_RPATH=ON
		-DGGML_RPC=ON
		-DLLAMA_CURL=$(usex curl)
		-DLLAMA_OPENSSL=$(usex openssl)
		-DBUILD_NUMBER="1"
		-DGENTOO_REMOVE_CMAKE_BLAS_HACK=ON

		# -- CPU feature flags --
		# No -march=native: explicit mapping from CPU_FLAGS_X86 for
		# reproducible/portable builds. SSE4.2 is the baseline.
		-DGGML_NATIVE=0
		-DGGML_SSE42=ON
		-DGGML_AVX=$(usex cpu_flags_x86_avx)
		-DGGML_AVX2=$(usex cpu_flags_x86_avx2)
		-DGGML_BMI2=$(usex cpu_flags_x86_bmi2)
		-DGGML_FMA=$(usex cpu_flags_x86_fma3)
		-DGGML_F16C=$(usex cpu_flags_x86_f16c)
		-DGGML_AVX_VNNI=$(usex cpu_flags_x86_avx_vnni)
		-DGGML_AVX512=$(usex cpu_flags_x86_avx512f)
		-DGGML_AVX512_VBMI=$(usex cpu_flags_x86_avx512vbmi)
		-DGGML_AVX512_VNNI=$(usex cpu_flags_x86_avx512_vnni)
		-DGGML_AVX512_BF16=$(usex cpu_flags_x86_avx512_bf16)

		# -- Backends --
		-DGGML_CUDA=$(usex cuda)
		-DGGML_OPENCL=$(usex opencl)
		-DGGML_OPENMP=$(usex openmp)
		-DGGML_VULKAN=$(usex vulkan)

		# -- Install paths (avoid clashing with whisper.cpp) --
		-DCMAKE_INSTALL_LIBDIR="${libdir}"
		-DCMAKE_INSTALL_RPATH="${libdir}"
	)

	# BLAS vendor selection (mutually exclusive via REQUIRED_USE)
	if use openblas; then
		mycmakeargs+=( -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS )
	elif use blis; then
		mycmakeargs+=( -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME )
	elif use flexiblas; then
		mycmakeargs+=( -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FlexiBLAS )
	fi

	# CUDA: set host compiler and sandbox for device node symlinks
	if use cuda; then
		local -x CUDAHOSTCXX="$(cuda_gccdir)"
		cuda_add_sandbox
		addpredict "/dev/char/"
	fi

	# ROCm/HIP: use hipcc and set GPU architecture targets
	if use rocm; then
		rocm_use_hipcc
		mycmakeargs+=(
			-DGGML_HIP=ON
			# Quoted so the target list always reaches CMake as one argument.
			-DAMDGPU_TARGETS="$(get_amdgpu_flags)"
			-DGGML_HIP_ROCWMMA_FATTN=$(usex wmma)
		)
	fi

	cmake_src_configure
}

# Install the build, the rpc-server binary, the service configuration and
# the (empty) model directory.
src_install() {
	cmake_src_install
	dobin "${BUILD_DIR}/bin/rpc-server"

	# Remove installed headers to avoid clashing with whisper.cpp
	rm -rf "${ED}/usr/include" || die

	# Systemd service unit and environment configuration
	insinto /etc/llama-cpp
	doins "${FILESDIR}"/llama-server.conf
	systemd_dounit "${FILESDIR}"/llama-server.service

	# State directory for model storage
	keepdir /var/lib/llama-cpp/models
}

# Restrict the state directories to the llama-cpp user and group.
# This is done here rather than in src_install because acct-user/llama-cpp
# and acct-group/llama-cpp are only listed in RDEPEND, so the account may
# not exist yet while the image is being built. The directories themselves
# are already created by keepdir in src_install.
pkg_preinst() {
	fowners llama-cpp:llama-cpp /var/lib/llama-cpp /var/lib/llama-cpp/models
	fperms 0750 /var/lib/llama-cpp /var/lib/llama-cpp/models
}

# Print post-install usage notes for the systemd service.
pkg_postinst() {
	elog "Installed binaries: llama-server, llama-cli, llama-quantize, rpc-server, ..."
	elog ""
	elog "Running as a systemd service:"
	elog "  1. Place a GGUF model in /var/lib/llama-cpp/models/"
	elog "  2. Edit /etc/llama-cpp/llama-server.conf (set LLAMA_MODEL, LLAMA_THREADS)"
	elog "  3. systemctl enable --now llama-server"
	elog "  The API is then available at http://\${LLAMA_HOST}:\${LLAMA_PORT} (OpenAI-compatible)"
	elog ""
	ewarn "Defaults in /etc/llama-cpp/llama-server.conf are CONSERVATIVE:"
	ewarn "  LLAMA_THREADS=2 -- adjust to your physical core count (not SMT threads)!"
	ewarn "  LLAMA_MODEL=... -- must point to an actual GGUF file!"
	ewarn "Without adjustment the service runs on 2 threads or fails to find a model."
}