# Copyright 1999-2026 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

DISTUTILS_USE_PEP517=setuptools
PYTHON_COMPAT=( python3_{12..14} )
DISTUTILS_SINGLE_IMPL=1

# The sdist on PyPI keeps the underscore (lm_eval-x.y.z.tar.gz) even though
# the project page lives at /pypi/lm-eval/ — pin the unnormalized name.
PYPI_PN="lm_eval"
PYPI_NO_NORMALIZE=1

inherit distutils-r1 pypi

DESCRIPTION="A framework for evaluating language models (lm-evaluation-harness)"
HOMEPAGE="
	https://github.com/EleutherAI/lm-evaluation-harness
	https://pypi.org/project/lm-eval/
"

LICENSE="MIT"
SLOT="0"
KEYWORDS="~amd64"
IUSE="+api ifeval math sentencepiece statsmodels vllm"

# Core deps mirror pyproject.toml [project.dependencies] at v0.4.11.
# Optional [project.optional-dependencies] groups become USE flags only
# where every member dep is reachable in our overlay set:
#   api           -> aiohttp, requests, tenacity, tqdm, tiktoken
#   ifeval        -> langdetect, immutabledict, nltk>=3.9.1
#   math          -> sympy, antlr4-python3-runtime==4.11.*, math-verify
#   sentencepiece -> sentencepiece
#   statsmodels   -> upstream "discrim_eval" extra (statsmodels)
#   vllm          -> vllm
# The remaining extras (hf, multilingual, ruler, wandb, japanese,
# longbench, libra, ipex, gptq, gptqmodel, optimum, sparsify,
# audiolm_qwen, unitxt, zeno, ibm_watsonx_ai, acpbench) gate on packages
# we do not currently carry; users wanting them must
# `pip install lm_eval[<extra>]`.
#
# math: at lm_eval 0.4.11, lm_eval/tasks/minerva_math/utils.py asserts
#   version("antlr4-python3-runtime").startswith("4.11")
# at task-load, so the antlr4-4.11* pin is load-bearing, not advisory
# (verified upstream 2026-05-11). We carry the older
# antlr4-python3-runtime-4.11.0 alongside ::gentoo's 4.13.2 for this;
# flipping USE=math triggers the downgrade.
#
# ifeval: at lm_eval 0.4.11, lm_eval/tasks/leaderboard/ifeval/
# instructions_util.py asserts nltk>=3.9.1 at module import (older nltk
# has a remote-code-exec via `punkt`, see
# EleutherAI/lm-evaluation-harness#2210 and nltk/nltk#3266), so the
# >=nltk-3.9.1 bound is load-bearing, not advisory (verified upstream
# 2026-05-11). The task also auto-downloads the punkt_tab tokenizer at
# the same import — users on offline hosts should seed ~/nltk_data ahead
# of time (a bare TaskManager() that loads leaderboard_ifeval is enough
# to trigger the fetch; not gated on actually running an eval).
#
# single-impl: sci-ml/{datasets,evaluate} are SINGLE_IMPL; the rest of
# the stack is multi-impl, wrapped via python_gen_cond_dep.
RDEPEND="
	>=sci-ml/datasets-2.16.0[${PYTHON_SINGLE_USEDEP}]
	>=sci-ml/evaluate-0.4.0[${PYTHON_SINGLE_USEDEP}]
	vllm? ( >=dev-python/vllm-0.4.2[${PYTHON_SINGLE_USEDEP}] )
	$(python_gen_cond_dep '
		dev-python/dill[${PYTHON_USEDEP}]
		dev-python/jinja2[${PYTHON_USEDEP}]
		dev-python/jsonlines[${PYTHON_USEDEP}]
		dev-python/more-itertools[${PYTHON_USEDEP}]
		dev-python/numpy[${PYTHON_USEDEP}]
		dev-python/pytablewriter[${PYTHON_USEDEP}]
		dev-python/rouge-score[${PYTHON_USEDEP}]
		dev-python/sacrebleu[${PYTHON_USEDEP}]
		dev-python/scikit-learn[${PYTHON_USEDEP}]
		dev-python/sqlitedict[${PYTHON_USEDEP}]
		dev-python/typing-extensions[${PYTHON_USEDEP}]
		dev-python/word2number[${PYTHON_USEDEP}]
		dev-python/zstandard[${PYTHON_USEDEP}]
		api? (
			dev-python/aiohttp[${PYTHON_USEDEP}]
			dev-python/requests[${PYTHON_USEDEP}]
			dev-python/tenacity[${PYTHON_USEDEP}]
			dev-python/tiktoken[${PYTHON_USEDEP}]
			dev-python/tqdm[${PYTHON_USEDEP}]
		)
		ifeval? (
			dev-python/immutabledict[${PYTHON_USEDEP}]
			dev-python/langdetect[${PYTHON_USEDEP}]
			>=dev-python/nltk-3.9.1[${PYTHON_USEDEP}]
		)
		math? (
			=dev-python/antlr4-python3-runtime-4.11*[${PYTHON_USEDEP}]
			~dev-python/math-verify-0.9.0[${PYTHON_USEDEP}]
			>=dev-python/sympy-1.12[${PYTHON_USEDEP}]
		)
		sentencepiece? ( >=sci-ml/sentencepiece-0.1.98[${PYTHON_USEDEP}] )
		statsmodels? ( dev-python/statsmodels[${PYTHON_USEDEP}] )
	')
"