# Copyright 2022-2025 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8
CMAKE_IN_SOURCE_BUILD=1
PYTHON_COMPAT=( python3_{10..13} )
DISTUTILS_USE_PEP517=setuptools
DISTUTILS_SINGLE_IMPL=1
DISTUTILS_EXT=1
ROCM_VERSION="5.7.1"
LLVM_COMPAT=( 17 18 19 20 )
LLVM_OPTIONAL=1

inherit cmake cuda cuda-extra distutils-r1 flag-o-matic llvm-r1 rocm toolchain-funcs

DESCRIPTION="Cross-platform inference and training machine-learning accelerator."
HOMEPAGE="https://onnxruntime.ai
		https://github.com/microsoft/onnxruntime"
SAFEINT_COMMIT=3.0.28
FLATBUFFERS_PV=23.5.26
DATE_PV=3.0.1
DLPACK_PV=0.6
SRC_URI="
	https://github.com/microsoft/${PN}/archive/refs/tags/v${PV}.tar.gz -> ${P}.tar.gz
	https://github.com/dcleblanc/SafeInt/archive/${SAFEINT_COMMIT}.tar.gz -> SafeInt-${SAFEINT_COMMIT:0:10}.tar.gz
	https://github.com/google/flatbuffers/archive/v${FLATBUFFERS_PV}.tar.gz -> flatbuffers-${FLATBUFFERS_PV}.tar.gz
	https://github.com/HowardHinnant/date/archive/v${DATE_PV}.tar.gz -> hhdate-${DATE_PV}.tar.gz
	https://github.com/dmlc/dlpack/archive/refs/tags/v${DLPACK_PV}.tar.gz -> dlpack-${DLPACK_PV}.tar.gz
"
LICENSE="MIT"
SLOT="0/${PV}"
KEYWORDS="~amd64"
CPU_FLAGS="cpu_flags_x86_avx cpu_flags_x86_avx2 cpu_flags_x86_avx512f"
IUSE="benchmark cuda onednn cudnn debug hip +python migraphx mimalloc lto test tensorrt llvm xnnpack
${CPU_FLAGS}"
RESTRICT="mirror test"
REQUIRED_USE="
	cuda? ( cudnn !lto )
	hip? ( migraphx )
	|| ( cudnn migraphx onednn tensorrt )
"

RDEPEND="
	dev-libs/protobuf:=
"

BDEPEND="
	${PYTHON_DEPS}
	app-admin/chrpath
	benchmark? ( dev-cpp/benchmark )
	cuda? (
		dev-libs/cutlass:=
		>=dev-util/nvidia-cuda-toolkit-12:=
	)
	cudnn? (
		dev-libs/cudnn:=
		sci-ml/cudnn-frontend:=
	)
	>dev-cpp/eigen-3.4.0:=[cuda?]
	dev-cpp/ms-gsl:=
	dev-cpp/nlohmann_json
	sci-ml/FP16
	dev-libs/FXdiv
	dev-libs/clog:=
	dev-libs/cpuinfo:=
	dev-libs/date:=
	dev-libs/protobuf:=
	dev-libs/re2
	hip? (
		sci-libs/hipFFT:=
		sci-libs/hipCUB:=
		>=dev-libs/rocr-runtime-${ROCM_VERSION}:=
		>=dev-util/hip-${ROCM_VERSION}:=
	)
	mimalloc? ( dev-libs/mimalloc )
	onednn? ( sci-ml/oneDNN:= )
	python? (
		$(python_gen_cond_dep '
			dev-python/cerberus[${PYTHON_USEDEP}]
			dev-python/coloredlogs[${PYTHON_USEDEP}]
			dev-python/flatbuffers[${PYTHON_USEDEP}]
			dev-python/h5py[${PYTHON_USEDEP}]
			dev-python/numpy[${PYTHON_USEDEP}]
			dev-python/psutil[${PYTHON_USEDEP}]
			dev-python/py-cpuinfo[${PYTHON_USEDEP}]
			dev-python/sympy[${PYTHON_USEDEP}]
		')
	)
	sci-ml/onnx:=[disableStaticReg]
	sci-ml/pytorch
	xnnpack? ( sci-ml/XNNPACK )
"

PATCHES=(
	"${FILESDIR}/onnxruntime-system-dnnl.patch"
	"${FILESDIR}/re2-pkg-config-r4.patch"
	"${FILESDIR}/system-onnx-r6.patch"
	"${FILESDIR}/system-composable_kernel-r5.patch"
	"${FILESDIR}/system-protobuf-r1.patch"
	"${FILESDIR}/system-mp11.patch"
	"${FILESDIR}/system-gsl-r5.patch"
	"${FILESDIR}/shared-build-fix.patch"
	"${FILESDIR}/contrib-ops.patch"
	"${FILESDIR}/disabled_rules_and_transformers.patch"
	"${FILESDIR}/Werror.patch"
	"${FILESDIR}/onnxruntime-1.19.0-abseil.patch"
	"${FILESDIR}/onnxruntime-1.19.0-eigen.patch"
	"${FILESDIR}/onnxruntime-1.21.0-system-eigen.patch"
	"${FILESDIR}/onnxruntime-1.20.0-cudnn_frontend.patch"
	"${FILESDIR}/onnxruntime-1.23.0-external-downloads.patch"
	"${FILESDIR}/onnxruntime-1.23.0-include.patch"
)

pkg_setup() {
	use python && python-single-r1_pkg_setup
	use llvm && llvm-r1_pkg_setup
}

src_prepare() {
	CMAKE_USE_DIR="${S}/cmake"

	if use python; then
		export PYTHONPATH="${S}/onnxruntime/python:${PYTHONPATH}"
		python_setup
	fi

	use cuda && cuda_src_prepare

	# Workaround for binary drivers.
	addpredict /dev/ati
	addpredict /dev/dri
	addpredict /dev/nvidiactl

	# fix build with gcc12(?), take idea from https://github.com/microsoft/onnxruntime/pull/11667 and https://github.com/microsoft/onnxruntime/pull/10014
	sed 's|dims)|TensorShape(dims))|g' \
		-i onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq.cc || die "Sed failed"

	# fix missing #include <iostream>
	sed '11a#include <iostream>' -i orttraining/orttraining/test/training_api/trainer/trainer.cc

	sed 's/\"-mavx512f\"/\"-mavx512f -Wno-error\"/g' -i cmake/onnxruntime_mlas.cmake || die "Sed failed"

	strip-unsupported-flags
	append-flags -Wa,--noexecstack

	cmake_src_prepare

	# for some reason, "patch -p1" can't handle a path like "a/../foo/bar"
	cd ..
	patch -p0 -i "${FILESDIR}/onnxruntime-1.22.0-cmake-4.patch"
}

src_configure() {
	export ROCM_PATH=/usr MIOPEN_PATH=/usr
	export ROCM_VERSION

	use python && python_setup
	CMAKE_BUILD_TYPE=$(usex debug RelWithDebInfo Release)
	CMAKE_INSTALL_PREFIX="${EPREFIX}/usr"
	CMAKE_TLS_VERIFY=ON
	PYTHON_EXECUTABLE="/usr/bin/${EPYTHON}"
	PYTHON_INCLUDE_DIR="$(python_get_includedir)"
	PYTHON_LIBRARY="$(python_get_library_path)"

	append-cxxflags -Wno-dangling-reference -Wno-c++20-compat

	local mycmakeargs=(
		-DCMAKE_INSTALL_INCLUDEDIR="include/${PN}"
		-Donnxruntime_REQUIRE_PYTHON_EMBED_LIB=OFF
		-Donnxruntime_USE_FULL_PROTOBUF=OFF
		-Donnxruntime_ENABLE_LANGUAGE_INTEROP_OPS=OFF
		-Donnxruntime_BUILD_SHARED_LIB=ON
		-Donnxruntime_ENABLE_PYTHON=$(usex python)
		-Donnxruntime_BUILD_BENCHMARKS=$(usex benchmark)
		-Donnxruntime_BUILD_UNIT_TESTS=$(usex test)
		-Donnxruntime_RUN_ONNX_TESTS=$(usex test)
		-Donnxruntime_ENABLE_LAZY_TENSOR=OFF
		-Donnxruntime_USE_PREINSTALLED_EIGEN=ON
		-Donnxruntime_USE_DNNL=$(usex onednn)
		-Donnxruntime_USE_CUDA=$(usex cuda)
		-Donnxruntime_USE_ROCM=$(usex hip)
		-Donnxruntime_USE_AVX=$(usex cpu_flags_x86_avx)
		-Donnxruntime_USE_AVX2=$(usex cpu_flags_x86_avx2)
		-Donnxruntime_USE_AVX512=$(usex cpu_flags_x86_avx512f)
		-Donnxruntime_USE_MIMALLOC=$(usex mimalloc)
		-Donnxruntime_USE_XNNPACK=$(usex xnnpack)
		-Donnxruntime_ENABLE_LTO=$(usex lto)
		-DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=ALWAYS
		-DFETCHCONTENT_FULLY_DISCONNECTED=ON
		-DFETCHCONTENT_QUIET=OFF
		-DFETCHCONTENT_SOURCE_DIR_SAFEINT="${WORKDIR}/SafeInt-${SAFEINT_COMMIT}"
		-DFETCHCONTENT_SOURCE_DIR_FLATBUFFERS="${WORKDIR}/flatbuffers-${FLATBUFFERS_PV}"
		-DFETCHCONTENT_SOURCE_DIR_DATE="${WORKDIR}/date-${DATE_PV}"
		-DFETCHCONTENT_SOURCE_DIR_DLPACK="${WORKDIR}/dlpack-${DLPACK_PV}"
		-Donnxruntime_USE_TENSORRT=$(usex tensorrt)
		-Donnxruntime_USE_JSEP=OFF
		-Donnxruntime_ENABLE_MEMORY_PROFILE=OFF
		-Donnxruntime_DISABLE_ABSEIL=ON
		-Donnxruntime_BUILD_FOR_NATIVE_MACHINE=OFF
		-Donnxruntime_USE_MIMALLOC=OFF
		-Donnxruntime_BUILD_CSHARP=OFF
		-Donnxruntime_BUILD_JAVA=OFF
		-Donnxruntime_BUILD_NODEJS=OFF
		-Donnxruntime_BUILD_OBJC=OFF
		-Donnxruntime_BUILD_APPLE_FRAMEWORK=OFF
		-Donnxruntime_USE_NNAPI_BUILTIN=OFF
		-Donnxruntime_USE_RKNPU=OFF
		-Donnxruntime_ENABLE_MICROSOFT_INTERNAL=OFF
		-Donnxruntime_USE_VITISAI=OFF
		-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER=OFF
		-Donnxruntime_USE_MIGRAPHX=$(usex migraphx)
		-Donnxruntime_CROSS_COMPILING=$(tc-is-cross-compiler && echo ON || echo OFF)
		-Donnxruntime_DISABLE_CONTRIB_OPS=ON
		-Donnxruntime_DISABLE_ML_OPS=ON
		-Donnxruntime_DISABLE_RTTI=OFF
		-Donnxruntime_DISABLE_EXCEPTIONS=$(usex !debug)
		-Donnxruntime_MINIMAL_BUILD=OFF
		-Donnxruntime_EXTENDED_MINIMAL_BUILD=OFF
		-Donnxruntime_MINIMAL_BUILD_CUSTOM_OPS=OFF
		-Donnxruntime_REDUCED_OPS_BUILD=OFF
		-Donnxruntime_ENABLE_LANGUAGE_INTEROP_OPS=OFF
		-Donnxruntime_USE_DML=OFF
		-Donnxruntime_USE_WINML=OFF
		-Donnxruntime_BUILD_MS_EXPERIMENTAL_OPS=OFF
		-Donnxruntime_USE_TELEMETRY=OFF
		-Donnxruntime_USE_ACL=OFF
		-Donnxruntime_USE_ARMNN=OFF
		-Donnxruntime_ARMNN_RELU_USE_CPU=ON
		-Donnxruntime_ARMNN_BN_USE_CPU=ON
		-Donnxruntime_ENABLE_NVTX_PROFILE=OFF
		-Donnxruntime_ENABLE_TRAINING=OFF
		-Donnxruntime_ENABLE_TRAINING_OPS=OFF
		-Donnxruntime_ENABLE_TRAINING_APIS=OFF
		-Donnxruntime_ENABLE_CPU_FP16_OPS=OFF
		-Donnxruntime_USE_NCCL=OFF
		-Donnxruntime_GCOV_COVERAGE=OFF
		-Donnxruntime_ENABLE_MEMORY_PROFILE=OFF
		-Donnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_CATCHING=ON
		-Donnxruntime_ENABLE_WEBASSEMBLY_API_EXCEPTION_CATCHING=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_THROWING=ON
		-Donnxruntime_WEBASSEMBLY_RUN_TESTS_IN_BROWSER=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_THREADS=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_DEBUG_INFO=OFF
		-Donnxruntime_ENABLE_WEBASSEMBLY_PROFILING=OFF
		-Donnxruntime_ENABLE_EAGER_MODE=OFF
		-Donnxruntime_ENABLE_LAZY_TENSOR=OFF
		-Donnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS=OFF
		-Donnxruntime_USE_CANN=OFF
		-Donnxruntime_PYBIND_EXPORT_OPSCHEMA=OFF
		-Donnxruntime_ENABLE_MEMLEAK_CHECKER=ON
	)

	if use cuda; then
		local cuda_architectures_real="$(cuda_get_host_native_arch | sed -e 's/$/-real/g' -e 's/;/-real;/g')"
		mycmakeargs+=(
			-Donnxruntime_CUDA_HOME=/opt/cuda
			-Donnxruntime_CUDNN_HOME=/usr
			-DCMAKE_CUDA_ARCHITECTURES="${cuda_architectures_real%%;}"
			-DCMAKE_CUDA_HOST_COMPILER="$(cuda_gccdir)"
			-DCMAKE_CUDA_FLAGS="-forward-unknown-opts -fno-lto ${NVCCFLAGS}"
			-DCMAKE_CUDA_STANDARD_REQUIRED=ON
			-DCMAKE_CXX_STANDARD_REQUIRED=ON
			-Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=OFF
			-Donnxruntime_ENABLE_CUDA_PROFILING=OFF
			-Donnxruntime_USE_NCCL=OFF # Multi GPU CUDA
			-Donnxruntime_NVCC_THREADS=1
		)
	fi

	if use hip; then
		mycmakeargs+=(
		-DCMAKE_HIP_COMPILER="$(get_llvm_prefix)/bin/clang++"
		-DCMAKE_HIP_ARCHITECTURES="$(get_amdgpu_flags)"
	)
	fi

	cmake_src_configure
}

src_compile() {
	cmake_src_compile

	if use python; then
		cd cmake
		cp -a ../{setup.py,pyproject.toml,docs} .
		distutils-r1_src_compile
	fi
}

src_install() {
	cmake_src_install

	if use python; then
		cd cmake
		distutils-r1_src_install
	fi
}