From ae9742e9f4e37e8cdd4bb80d00bc91e1a69e2eaf Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Fri, 7 Mar 2025 19:28:39 +0100
Subject: [PATCH 04/12] cachy

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 .../admin-guide/kernel-parameters.txt         |   12 +
 Documentation/admin-guide/sysctl/vm.rst       |   72 +
 Makefile                                      |    8 +
 arch/x86/Kconfig.cpu                          |  367 +-
 arch/x86/Makefile                             |   89 +-
 arch/x86/include/asm/pci.h                    |    6 +
 arch/x86/include/asm/vermagic.h               |   72 +
 arch/x86/pci/common.c                         |    7 +-
 block/Kconfig.iosched                         |    9 +
 block/Makefile                                |    8 +
 block/adios.c                                 | 1352 +++++++
 block/elevator.c                              |    8 +
 drivers/Makefile                              |   13 +-
 drivers/ata/ahci.c                            |   23 +-
 drivers/cpufreq/Kconfig.x86                   |    2 -
 drivers/cpufreq/intel_pstate.c                |    2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |    1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c  |   44 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h  |    1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |    6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |   10 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c       |   19 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.h       |    1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h      |    1 +
 drivers/gpu/drm/amd/display/Kconfig           |    6 +
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |   69 +-
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |    7 +
 .../amd/display/amdgpu_dm/amdgpu_dm_color.c   |    2 +-
 .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c    |    6 +-
 .../amd/display/amdgpu_dm/amdgpu_dm_plane.c   |    6 +-
 .../drm/amd/display/dc/bios/bios_parser2.c    |   13 +-
 .../drm/amd/display/dc/core/dc_link_exports.c |    6 +
 drivers/gpu/drm/amd/display/dc/dc.h           |    3 +
 .../dc/resource/dce120/dce120_resource.c      |   17 +
 drivers/gpu/drm/amd/pm/amdgpu_pm.c            |    3 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     |   14 +-
 drivers/input/evdev.c                         |   19 +-
 drivers/md/dm-crypt.c                         |    5 +
 drivers/media/v4l2-core/Kconfig               |    5 +
 drivers/media/v4l2-core/Makefile              |    2 +
 drivers/media/v4l2-core/v4l2loopback.c        | 3184 +++++++++++++++++
 drivers/media/v4l2-core/v4l2loopback.h        |   98 +
 .../media/v4l2-core/v4l2loopback_formats.h    |  445 +++
 drivers/pci/controller/Makefile               |    6 +
 drivers/pci/controller/intel-nvme-remap.c     |  462 +++
 drivers/pci/quirks.c                          |  101 +
 drivers/scsi/Kconfig                          |    2 +
 drivers/scsi/Makefile                         |    1 +
 drivers/scsi/vhba/Kconfig                     |    9 +
 drivers/scsi/vhba/Makefile                    |    4 +
 drivers/scsi/vhba/vhba.c                      | 1130 ++++++
 include/linux/mm.h                            |    8 +
 include/linux/pagemap.h                       |    2 +-
 include/linux/user_namespace.h                |    4 +
 include/linux/wait.h                          |    2 +
 init/Kconfig                                  |   26 +
 kernel/Kconfig.hz                             |   24 +
 kernel/Kconfig.preempt                        |    2 +-
 kernel/fork.c                                 |   14 +
 kernel/locking/rwsem.c                        |    4 +-
 kernel/sched/fair.c                           |   13 +
 kernel/sched/sched.h                          |    2 +-
 kernel/sched/wait.c                           |   24 +
 kernel/sysctl.c                               |   46 +
 kernel/user_namespace.c                       |    7 +
 mm/Kconfig                                    |   65 +-
 mm/compaction.c                               |    4 +
 mm/huge_memory.c                              |    4 +
 mm/mm_init.c                                  |    1 +
 mm/page-writeback.c                           |    8 +
 mm/page_alloc.c                               |    4 +
 mm/swap.c                                     |    5 +
 mm/vmpressure.c                               |    4 +
 mm/vmscan.c                                   |  143 +
 net/ipv4/inet_connection_sock.c               |    2 +-
 75 files changed, 8083 insertions(+), 93 deletions(-)
 create mode 100644 block/adios.c
 create mode 100644 drivers/media/v4l2-core/v4l2loopback.c
 create mode 100644 drivers/media/v4l2-core/v4l2loopback.h
 create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h
 create mode 100644 drivers/pci/controller/intel-nvme-remap.c
 create mode 100644 drivers/scsi/vhba/Kconfig
 create mode 100644 drivers/scsi/vhba/Makefile
 create mode 100644 drivers/scsi/vhba/vhba.c

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3872bc6ec49d..5e8881ec6b40 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2256,6 +2256,9 @@
 			disable
 			  Do not enable intel_pstate as the default
 			  scaling driver for the supported processors
+			enable
+			  Enable intel_pstate in case "disable" was passed
+			  previously in the kernel boot parameters
                         active
                           Use intel_pstate driver to bypass the scaling
                           governors layer of cpufreq and provides it own
@@ -4481,6 +4484,15 @@
 		nomsi		[MSI] If the PCI_MSI kernel config parameter is
 				enabled, this kernel boot option can be used to
 				disable the use of MSI interrupts system-wide.
+		pcie_acs_override =
+					[PCIE] Override missing PCIe ACS support for:
+				downstream
+					All downstream ports - full ACS capabilities
+				multifunction
+					All multifunction devices - multifunction ACS subset
+				id:nnnn:nnnn
+					Specific device - full ACS capabilities
+					Specified as vid:did (vendor/device ID) in hex
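+					For example, to apply the full override to
+					all downstream ports, boot with:
+					pcie_acs_override=downstream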
 		noioapicquirk	[APIC] Disable all boot interrupt quirks.
 				Safety option to keep boot IRQs enabled. This
 				should never be necessary.
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index f48eaa98d22d..fc777c14cff6 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -25,6 +25,9 @@ files can be found in mm/swap.c.
 Currently, these files are in /proc/sys/vm:
 
 - admin_reserve_kbytes
+- anon_min_ratio
+- clean_low_ratio
+- clean_min_ratio
 - compact_memory
 - compaction_proactiveness
 - compact_unevictable_allowed
@@ -108,6 +111,67 @@ On x86_64 this is about 128MB.
 Changing this takes effect whenever an application requests memory.
 
 
+anon_min_ratio
+==============
+
+This knob provides *hard* protection of anonymous pages. The anonymous pages
+on the current node won't be reclaimed under any conditions when their amount
+is below vm.anon_min_ratio.
+
+This knob may be used to prevent excessive swap thrashing when anonymous
+memory is low (for example, when memory is going to be overfilled by
+compressed data of zram module).
+
+Setting this value too high (close to 100) can result in an inability to
+swap and can lead to early OOM under memory pressure.
+
+The unit of measurement is the percentage of the total memory of the node.
+
+The default value is 15.
+
+
+clean_low_ratio
+================
+
+This knob provides *best-effort* protection of clean file pages. The file pages
+on the current node won't be reclaimed under memory pressure when the amount of
+clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM.
+
+Protection of clean file pages using this knob may be used when swapping is
+still possible to
+  - prevent disk I/O thrashing under memory pressure;
+  - improve performance in disk cache-bound tasks under memory pressure.
+
+Setting it to a high value may result in an early eviction of anonymous pages
+into the swap space by attempting to hold the protected amount of clean file
+pages in memory.
+
+The unit of measurement is the percentage of the total memory of the node.
+
+The default value is 0.
+
+
+clean_min_ratio
+================
+
+This knob provides *hard* protection of clean file pages. The file pages on the
+current node won't be reclaimed under memory pressure when the amount of clean
+file pages is below vm.clean_min_ratio.
+
+Hard protection of clean file pages using this knob may be used to
+  - prevent disk I/O thrashing under memory pressure even with no free swap space;
+  - improve performance in disk cache-bound tasks under memory pressure;
+  - avoid high latency and prevent livelock in near-OOM conditions.
+
+Setting it to a high value may result in an early out-of-memory condition due to
+the inability to reclaim the protected amount of clean file pages when other
+types of pages cannot be reclaimed.
+
+The unit of measurement is the percentage of the total memory of the node.
+
+The default value is 15.
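+
+All three protection knobs described above live under /proc/sys/vm and can be
+inspected or adjusted at runtime through the usual sysctl interface, for
+example (illustrative values)::
+
+    sysctl vm.anon_min_ratio
+    sysctl -w vm.clean_min_ratio=15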
+
+
 compact_memory
 ==============
 
@@ -964,6 +1028,14 @@ be 133 (x + 2x = 200, 2x = 133.33).
 At 0, the kernel will not initiate swap until the amount of free and
 file-backed pages is less than the high watermark in a zone.
 
+This knob has no effect if the amount of clean file pages on the current
+node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case,
+only anonymous pages can be reclaimed.
+
+If the number of anonymous pages on the current node is below
+vm.anon_min_ratio, then only file pages can be reclaimed with
+any vm.swappiness value.
+
 
 unprivileged_userfaultfd
 ========================
diff --git a/Makefile b/Makefile
index f49182f3bae1..5da1041d0860 100644
--- a/Makefile
+++ b/Makefile
@@ -860,11 +860,19 @@ KBUILD_CFLAGS	+= -fno-delete-null-pointer-checks
 ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
 KBUILD_CFLAGS += -O2
 KBUILD_RUSTFLAGS += -Copt-level=2
+else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3
+KBUILD_CFLAGS += -O3
+KBUILD_RUSTFLAGS += -Copt-level=3
 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS += -Os
 KBUILD_RUSTFLAGS += -Copt-level=s
 endif
 
+# Perform swing modulo scheduling immediately before the first scheduling pass.
+# This pass looks at innermost loops and reorders their instructions by
+# overlapping different iterations.
+KBUILD_CFLAGS += $(call cc-option,-fmodulo-sched -fmodulo-sched-allow-regmoves -fivopts)
+
 # Always set `debug-assertions` and `overflow-checks` because their default
 # depends on `opt-level` and `debug-assertions`, respectively.
 KBUILD_RUSTFLAGS += -Cdebug-assertions=$(if $(CONFIG_RUST_DEBUG_ASSERTIONS),y,n)
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 25c55cc17c5e..990c4e91e551 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -155,9 +155,8 @@ config MPENTIUM4
 		-Paxville
 		-Dempsey
 
-
 config MK6
-	bool "K6/K6-II/K6-III"
+	bool "AMD K6/K6-II/K6-III"
 	depends on X86_32
 	help
 	  Select this for an AMD K6-family processor.  Enables use of
@@ -165,7 +164,7 @@ config MK6
 	  flags to GCC.
 
 config MK7
-	bool "Athlon/Duron/K7"
+	bool "AMD Athlon/Duron/K7"
 	depends on X86_32
 	help
 	  Select this for an AMD Athlon K7-family processor.  Enables use of
@@ -173,12 +172,114 @@ config MK7
 	  flags to GCC.
 
 config MK8
-	bool "Opteron/Athlon64/Hammer/K8"
+	bool "AMD Opteron/Athlon64/Hammer/K8"
 	help
 	  Select this for an AMD Opteron or Athlon64 Hammer-family processor.
 	  Enables use of some extended instructions, and passes appropriate
 	  optimization flags to GCC.
 
+config MK8SSE3
+	bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
+	help
+	  Select this for improved AMD Opteron or Athlon64 Hammer-family processors.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MK10
+	bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
+	help
+	  Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
+	  Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MBARCELONA
+	bool "AMD Barcelona"
+	help
+	  Select this for AMD Family 10h Barcelona processors.
+
+	  Enables -march=barcelona
+
+config MBOBCAT
+	bool "AMD Bobcat"
+	help
+	  Select this for AMD Family 14h Bobcat processors.
+
+	  Enables -march=btver1
+
+config MJAGUAR
+	bool "AMD Jaguar"
+	help
+	  Select this for AMD Family 16h Jaguar processors.
+
+	  Enables -march=btver2
+
+config MBULLDOZER
+	bool "AMD Bulldozer"
+	help
+	  Select this for AMD Family 15h Bulldozer processors.
+
+	  Enables -march=bdver1
+
+config MPILEDRIVER
+	bool "AMD Piledriver"
+	help
+	  Select this for AMD Family 15h Piledriver processors.
+
+	  Enables -march=bdver2
+
+config MSTEAMROLLER
+	bool "AMD Steamroller"
+	help
+	  Select this for AMD Family 15h Steamroller processors.
+
+	  Enables -march=bdver3
+
+config MEXCAVATOR
+	bool "AMD Excavator"
+	help
+	  Select this for AMD Family 15h Excavator processors.
+
+	  Enables -march=bdver4
+
+config MZEN
+	bool "AMD Zen"
+	help
+	  Select this for AMD Family 17h Zen processors.
+
+	  Enables -march=znver1
+
+config MZEN2
+	bool "AMD Zen 2"
+	help
+	  Select this for AMD Family 17h Zen 2 processors.
+
+	  Enables -march=znver2
+
+config MZEN3
+	bool "AMD Zen 3"
+	depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	help
+	  Select this for AMD Family 19h Zen 3 processors.
+
+	  Enables -march=znver3
+
+config MZEN4
+	bool "AMD Zen 4"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 160000)
+	help
+	  Select this for AMD Family 19h Zen 4 processors.
+
+	  Enables -march=znver4
+
+config MZEN5
+	bool "AMD Zen 5"
+	depends on (CC_IS_GCC && GCC_VERSION > 140000) || (CC_IS_CLANG && CLANG_VERSION >= 190100)
+	help
+	  Select this for AMD Family 1Ah Zen 5 processors.
+
+	  Enables -march=znver5
+
 config MCRUSOE
 	bool "Crusoe"
 	depends on X86_32
@@ -269,8 +370,17 @@ config MPSC
 	  using the cpu family field
 	  in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
 
+config MATOM
+	bool "Intel Atom"
+	help
+
+	  Select this for the Intel Atom platform. Intel Atom CPUs have an
+	  in-order pipelining architecture and thus can benefit from
+	  accordingly optimized code. Use a recent GCC with specific Atom
+	  support in order to fully benefit from selecting this option.
+
 config MCORE2
-	bool "Core 2/newer Xeon"
+	bool "Intel Core 2"
 	help
 
 	  Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
@@ -278,14 +388,199 @@ config MCORE2
 	  family in /proc/cpuinfo. Newer ones have 6 and older ones 15
 	  (not a typo)
 
-config MATOM
-	bool "Intel Atom"
+	  Enables -march=core2
+
+config MNEHALEM
+	bool "Intel Nehalem"
 	help
 
-	  Select this for the Intel Atom platform. Intel Atom CPUs have an
-	  in-order pipelining architecture and thus can benefit from
-	  accordingly optimized code. Use a recent GCC with specific Atom
-	  support in order to fully benefit from selecting this option.
+	  Select this for 1st Gen Core processors in the Nehalem family.
+
+	  Enables -march=nehalem
+
+config MWESTMERE
+	bool "Intel Westmere"
+	help
+
+	  Select this for the Intel Westmere (formerly Nehalem-C) family.
+
+	  Enables -march=westmere
+
+config MSILVERMONT
+	bool "Intel Silvermont"
+	help
+
+	  Select this for the Intel Silvermont platform.
+
+	  Enables -march=silvermont
+
+config MGOLDMONT
+	bool "Intel Goldmont"
+	help
+
+	  Select this for the Intel Goldmont platform including Apollo Lake and Denverton.
+
+	  Enables -march=goldmont
+
+config MGOLDMONTPLUS
+	bool "Intel Goldmont Plus"
+	help
+
+	  Select this for the Intel Goldmont Plus platform including Gemini Lake.
+
+	  Enables -march=goldmont-plus
+
+config MSANDYBRIDGE
+	bool "Intel Sandy Bridge"
+	help
+
+	  Select this for 2nd Gen Core processors in the Sandy Bridge family.
+
+	  Enables -march=sandybridge
+
+config MIVYBRIDGE
+	bool "Intel Ivy Bridge"
+	help
+
+	  Select this for 3rd Gen Core processors in the Ivy Bridge family.
+
+	  Enables -march=ivybridge
+
+config MHASWELL
+	bool "Intel Haswell"
+	help
+
+	  Select this for 4th Gen Core processors in the Haswell family.
+
+	  Enables -march=haswell
+
+config MBROADWELL
+	bool "Intel Broadwell"
+	help
+
+	  Select this for 5th Gen Core processors in the Broadwell family.
+
+	  Enables -march=broadwell
+
+config MSKYLAKE
+	bool "Intel Skylake"
+	help
+
+	  Select this for 6th Gen Core processors in the Skylake family.
+
+	  Enables -march=skylake
+
+config MSKYLAKEX
+	bool "Intel Skylake X"
+	help
+
+	  Select this for 6th Gen Core processors in the Skylake X family.
+
+	  Enables -march=skylake-avx512
+
+config MCANNONLAKE
+	bool "Intel Cannon Lake"
+	help
+
+	  Select this for 8th Gen Core processors in the Cannon Lake family.
+
+	  Enables -march=cannonlake
+
+config MICELAKE_CLIENT
+	bool "Intel Ice Lake (Client)"
+	help
+
+	  Select this for 10th Gen Core processors in the Ice Lake family.
+
+	  Enables -march=icelake-client
+
+config MICELAKE_SERVER
+	bool "Intel Ice Lake (Server)"
+	help
+
+	  Select this for the 3rd Gen Xeon processors in the Ice Lake family.
+
+	  Enables -march=icelake-server
+
+config MCASCADELAKE
+	bool "Intel Cascade Lake"
+	help
+
+	  Select this for Xeon processors in the Cascade Lake family.
+
+	  Enables -march=cascadelake
+
+config MCOOPERLAKE
+	bool "Intel Cooper Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+	help
+
+	  Select this for Xeon processors in the Cooper Lake family.
+
+	  Enables -march=cooperlake
+
+config MTIGERLAKE
+	bool "Intel Tiger Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+	help
+
+	  Select this for third-generation 10 nm process processors in the Tiger Lake family.
+
+	  Enables -march=tigerlake
+
+config MSAPPHIRERAPIDS
+	bool "Intel Sapphire Rapids"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	help
+
+	  Select this for fourth-generation 10 nm process processors in the Sapphire Rapids family.
+
+	  Enables -march=sapphirerapids
+
+config MROCKETLAKE
+	bool "Intel Rocket Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	help
+
+	  Select this for eleventh-generation processors in the Rocket Lake family.
+
+	  Enables -march=rocketlake
+
+config MALDERLAKE
+	bool "Intel Alder Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	help
+
+	  Select this for twelfth-generation processors in the Alder Lake family.
+
+	  Enables -march=alderlake
+
+config MRAPTORLAKE
+	bool "Intel Raptor Lake"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	help
+
+	  Select this for thirteenth-generation processors in the Raptor Lake family.
+
+	  Enables -march=raptorlake
+
+config MMETEORLAKE
+	bool "Intel Meteor Lake"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	help
+
+	  Select this for fourteenth-generation processors in the Meteor Lake family.
+
+	  Enables -march=meteorlake
+
+config MEMERALDRAPIDS
+	bool "Intel Emerald Rapids"
+	depends on (CC_IS_GCC && GCC_VERSION > 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	help
+
+	  Select this for fifth-generation 10 nm process processors in the Emerald Rapids family.
+
+	  Enables -march=emeraldrapids
 
 config GENERIC_CPU
 	bool "Generic-x86-64"
@@ -294,6 +589,26 @@ config GENERIC_CPU
 	  Generic x86-64 CPU.
 	  Run equally well on all x86-64 CPUs.
 
+config MNATIVE_INTEL
+	bool "Intel-Native optimizations autodetected by the compiler"
+	help
+
+	  Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+	  the optimum settings to use based on your processor. Do NOT use this
+	  for AMD CPUs.  Intel Only!
+
+	  Enables -march=native
+
+config MNATIVE_AMD
+	bool "AMD-Native optimizations autodetected by the compiler"
+	help
+
+	  Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+	  the optimum settings to use based on your processor. Do NOT use this
+	  for Intel CPUs.  AMD Only!
+
+	  Enables -march=native
+
 endchoice
 
 config X86_GENERIC
@@ -308,6 +623,30 @@ config X86_GENERIC
 	  This is really intended for distributors who need more
 	  generic optimizations.
 
+config X86_64_VERSION
+	int "x86-64 compiler ISA level"
+	range 1 4
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64 && GENERIC_CPU
+	help
+	  Specify a specific x86-64 compiler ISA level.
+
+	  There are three x86-64 ISA levels that work on top of
+	  the x86-64 baseline, namely: x86-64-v2, x86-64-v3, and x86-64-v4.
+
+	  x86-64-v2 brings support for vector instructions up to Streaming SIMD
+	  Extensions 4.2 (SSE4.2) and Supplemental Streaming SIMD Extensions 3
+	  (SSSE3), the POPCNT instruction, and CMPXCHG16B.
+
+	  x86-64-v3 adds vector instructions up to AVX2, MOVBE, and additional
+	  bit-manipulation instructions.
+
+	  x86-64-v4 is not included since the kernel does not use AVX-512 instructions.
+
+	  You can find the best version for your CPU by running one of the following:
+	  /lib/ld-linux-x86-64.so.2 --help | grep supported
+	  /lib64/ld-linux-x86-64.so.2 --help | grep supported
+
 #
 # Define implied options from the CPU selection here
 config X86_INTERNODE_CACHE_SHIFT
@@ -318,7 +657,7 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
-	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE_CLIENT || MICELAKE_SERVER || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
 	default "4" if MELAN || M486SX || M486 || MGEODEGX1
 	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 
@@ -336,11 +675,11 @@ config X86_ALIGNMENT_16
 
 config X86_INTEL_USERCOPY
 	def_bool y
-	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE_CLIENT || MICELAKE_SERVER || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL
 
 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE_CLIENT || MICELAKE_SERVER || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
 
 #
 # P6_NOPs are a relatively minor optimization that require a family >=
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5b773b34768d..5f57fd8750c6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -182,15 +182,98 @@ else
         cflags-$(CONFIG_MK8)		+= -march=k8
         cflags-$(CONFIG_MPSC)		+= -march=nocona
         cflags-$(CONFIG_MCORE2)		+= -march=core2
-        cflags-$(CONFIG_MATOM)		+= -march=atom
-        cflags-$(CONFIG_GENERIC_CPU)	+= -mtune=generic
+        cflags-$(CONFIG_MATOM)		+= -march=bonnell
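+        # For CONFIG_GENERIC_CPU, CONFIG_X86_64_VERSION=1 keeps the x86-64
+        # baseline ISA and applies only generic tuning; higher values raise
+        # the ISA level via -march=x86-64-v<N> (and the matching -Ctarget-cpu
+        # for Rust), see the X86_64_VERSION help text in arch/x86/Kconfig.cpu.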
+        ifeq ($(CONFIG_X86_64_VERSION),1)
+          cflags-$(CONFIG_GENERIC_CPU)		+= -mtune=generic
+          rustflags-$(CONFIG_GENERIC_CPU)	+= -Ztune-cpu=generic
+        else
+          cflags-$(CONFIG_GENERIC_CPU)		+= -march=x86-64-v$(CONFIG_X86_64_VERSION)
+          rustflags-$(CONFIG_GENERIC_CPU)	+= -Ctarget-cpu=x86-64-v$(CONFIG_X86_64_VERSION)
+        endif
+        cflags-$(CONFIG_MK8SSE3)	+= -march=k8-sse3
+        cflags-$(CONFIG_MK10) 		+= -march=amdfam10
+        cflags-$(CONFIG_MBARCELONA) 	+= -march=barcelona
+        cflags-$(CONFIG_MBOBCAT) 	+= -march=btver1
+        cflags-$(CONFIG_MJAGUAR) 	+= -march=btver2
+        cflags-$(CONFIG_MBULLDOZER) 	+= -march=bdver1
+        cflags-$(CONFIG_MPILEDRIVER)	+= -march=bdver2 -mno-tbm
+        cflags-$(CONFIG_MSTEAMROLLER) 	+= -march=bdver3 -mno-tbm
+        cflags-$(CONFIG_MEXCAVATOR) 	+= -march=bdver4 -mno-tbm
+        cflags-$(CONFIG_MZEN) 		+= -march=znver1
+        cflags-$(CONFIG_MZEN2) 	+= -march=znver2
+        cflags-$(CONFIG_MZEN3) 	+= -march=znver3
+        cflags-$(CONFIG_MZEN4) 	+= -march=znver4
+        cflags-$(CONFIG_MZEN5) 	+= -march=znver5
+        cflags-$(CONFIG_MNATIVE_INTEL) += -march=native
+        cflags-$(CONFIG_MNATIVE_AMD) 	+= -march=native -mno-tbm
+        cflags-$(CONFIG_MNEHALEM) 	+= -march=nehalem
+        cflags-$(CONFIG_MWESTMERE) 	+= -march=westmere
+        cflags-$(CONFIG_MSILVERMONT) 	+= -march=silvermont
+        cflags-$(CONFIG_MGOLDMONT) 	+= -march=goldmont
+        cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus
+        cflags-$(CONFIG_MSANDYBRIDGE) 	+= -march=sandybridge
+        cflags-$(CONFIG_MIVYBRIDGE) 	+= -march=ivybridge
+        cflags-$(CONFIG_MHASWELL) 	+= -march=haswell
+        cflags-$(CONFIG_MBROADWELL) 	+= -march=broadwell
+        cflags-$(CONFIG_MSKYLAKE) 	+= -march=skylake
+        cflags-$(CONFIG_MSKYLAKEX) 	+= -march=skylake-avx512
+        cflags-$(CONFIG_MCANNONLAKE) 	+= -march=cannonlake
+        cflags-$(CONFIG_MICELAKE_CLIENT) 	+= -march=icelake-client
+        cflags-$(CONFIG_MICELAKE_SERVER) 	+= -march=icelake-server
+        cflags-$(CONFIG_MCASCADELAKE) 	+= -march=cascadelake
+        cflags-$(CONFIG_MCOOPERLAKE) 	+= -march=cooperlake
+        cflags-$(CONFIG_MTIGERLAKE) 	+= -march=tigerlake
+        cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids
+        cflags-$(CONFIG_MROCKETLAKE) 	+= -march=rocketlake
+        cflags-$(CONFIG_MALDERLAKE) 	+= -march=alderlake
+        cflags-$(CONFIG_MRAPTORLAKE) 	+= -march=raptorlake
+        cflags-$(CONFIG_MMETEORLAKE) 	+= -march=meteorlake
+        cflags-$(CONFIG_MEMERALDRAPIDS)	+= -march=emeraldrapids
         KBUILD_CFLAGS += $(cflags-y)
 
         rustflags-$(CONFIG_MK8)		+= -Ctarget-cpu=k8
         rustflags-$(CONFIG_MPSC)	+= -Ctarget-cpu=nocona
         rustflags-$(CONFIG_MCORE2)	+= -Ctarget-cpu=core2
         rustflags-$(CONFIG_MATOM)	+= -Ctarget-cpu=atom
-        rustflags-$(CONFIG_GENERIC_CPU)	+= -Ztune-cpu=generic
+        rustflags-$(CONFIG_MK8SSE3)	+= -Ctarget-cpu=k8-sse3
+        rustflags-$(CONFIG_MK10) 		+= -Ctarget-cpu=amdfam10
+        rustflags-$(CONFIG_MBARCELONA) 	+= -Ctarget-cpu=barcelona
+        rustflags-$(CONFIG_MBOBCAT) 	+= -Ctarget-cpu=btver1
+        rustflags-$(CONFIG_MJAGUAR) 	+= -Ctarget-cpu=btver2
+        rustflags-$(CONFIG_MBULLDOZER) 	+= -Ctarget-cpu=bdver1
+        rustflags-$(CONFIG_MPILEDRIVER)	+= -Ctarget-cpu=bdver2
+        rustflags-$(CONFIG_MSTEAMROLLER) 	+= -Ctarget-cpu=bdver3
+        rustflags-$(CONFIG_MEXCAVATOR) 	+= -Ctarget-cpu=bdver4
+        rustflags-$(CONFIG_MZEN) 		+= -Ctarget-cpu=znver1
+        rustflags-$(CONFIG_MZEN2) 	+= -Ctarget-cpu=znver2
+        rustflags-$(CONFIG_MZEN3) 	+= -Ctarget-cpu=znver3
+        rustflags-$(CONFIG_MZEN4) 	+= -Ctarget-cpu=znver4
+        rustflags-$(CONFIG_MZEN5) 	+= -Ctarget-cpu=znver5
+        rustflags-$(CONFIG_MNATIVE_INTEL) += -Ctarget-cpu=native
+        rustflags-$(CONFIG_MNATIVE_AMD) 	+= -Ctarget-cpu=native
+        rustflags-$(CONFIG_MNEHALEM) 	+= -Ctarget-cpu=nehalem
+        rustflags-$(CONFIG_MWESTMERE) 	+= -Ctarget-cpu=westmere
+        rustflags-$(CONFIG_MSILVERMONT) 	+= -Ctarget-cpu=silvermont
+        rustflags-$(CONFIG_MGOLDMONT) 	+= -Ctarget-cpu=goldmont
+        rustflags-$(CONFIG_MGOLDMONTPLUS) += -Ctarget-cpu=goldmont-plus
+        rustflags-$(CONFIG_MSANDYBRIDGE) 	+= -Ctarget-cpu=sandybridge
+        rustflags-$(CONFIG_MIVYBRIDGE) 	+= -Ctarget-cpu=ivybridge
+        rustflags-$(CONFIG_MHASWELL) 	+= -Ctarget-cpu=haswell
+        rustflags-$(CONFIG_MBROADWELL) 	+= -Ctarget-cpu=broadwell
+        rustflags-$(CONFIG_MSKYLAKE) 	+= -Ctarget-cpu=skylake
+        rustflags-$(CONFIG_MSKYLAKEX) 	+= -Ctarget-cpu=skylake-avx512
+        rustflags-$(CONFIG_MCANNONLAKE) 	+= -Ctarget-cpu=cannonlake
+        rustflags-$(CONFIG_MICELAKE_CLIENT) 	+= -Ctarget-cpu=icelake-client
+        rustflags-$(CONFIG_MICELAKE_SERVER) 	+= -Ctarget-cpu=icelake-server
+        rustflags-$(CONFIG_MCASCADELAKE) 	+= -Ctarget-cpu=cascadelake
+        rustflags-$(CONFIG_MCOOPERLAKE) 	+= -Ctarget-cpu=cooperlake
+        rustflags-$(CONFIG_MTIGERLAKE) 	+= -Ctarget-cpu=tigerlake
+        rustflags-$(CONFIG_MSAPPHIRERAPIDS) += -Ctarget-cpu=sapphirerapids
+        rustflags-$(CONFIG_MROCKETLAKE) 	+= -Ctarget-cpu=rocketlake
+        rustflags-$(CONFIG_MALDERLAKE) 	+= -Ctarget-cpu=alderlake
+        rustflags-$(CONFIG_MRAPTORLAKE) 	+= -Ctarget-cpu=raptorlake
+        rustflags-$(CONFIG_MMETEORLAKE) 	+= -Ctarget-cpu=meteorlake
+        rustflags-$(CONFIG_MEMERALDRAPIDS)	+= -Ctarget-cpu=emeraldrapids
         KBUILD_RUSTFLAGS += $(rustflags-y)
 
         KBUILD_CFLAGS += -mno-red-zone
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b3ab80a03365..5e883b397ff3 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -26,6 +26,7 @@ struct pci_sysdata {
 #if IS_ENABLED(CONFIG_VMD)
 	struct pci_dev	*vmd_dev;	/* VMD Device if in Intel VMD domain */
 #endif
+	struct pci_dev	*nvme_remap_dev;	/* AHCI Device if NVME remapped bus */
 };
 
 extern int pci_routeirq;
@@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus *bus)
 #define is_vmd(bus)		false
 #endif /* CONFIG_VMD */
 
+static inline bool is_nvme_remap(struct pci_bus *bus)
+{
+	return to_pci_sysdata(bus)->nvme_remap_dev != NULL;
+}
+
 /* Can be used to override the logic in pci_scan_bus for skipping
    already-configured bus numbers - to be used for buggy BIOSes
    or architectures with incomplete PCI setup by the loader */
diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h
index 75884d2cdec3..2fdae271f47f 100644
--- a/arch/x86/include/asm/vermagic.h
+++ b/arch/x86/include/asm/vermagic.h
@@ -17,6 +17,56 @@
 #define MODULE_PROC_FAMILY "586MMX "
 #elif defined CONFIG_MCORE2
 #define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MNATIVE_INTEL
+#define MODULE_PROC_FAMILY "NATIVE_INTEL "
+#elif defined CONFIG_MNATIVE_AMD
+#define MODULE_PROC_FAMILY "NATIVE_AMD "
+#elif defined CONFIG_MNEHALEM
+#define MODULE_PROC_FAMILY "NEHALEM "
+#elif defined CONFIG_MWESTMERE
+#define MODULE_PROC_FAMILY "WESTMERE "
+#elif defined CONFIG_MSILVERMONT
+#define MODULE_PROC_FAMILY "SILVERMONT "
+#elif defined CONFIG_MGOLDMONT
+#define MODULE_PROC_FAMILY "GOLDMONT "
+#elif defined CONFIG_MGOLDMONTPLUS
+#define MODULE_PROC_FAMILY "GOLDMONTPLUS "
+#elif defined CONFIG_MSANDYBRIDGE
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
+#elif defined CONFIG_MIVYBRIDGE
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
+#elif defined CONFIG_MHASWELL
+#define MODULE_PROC_FAMILY "HASWELL "
+#elif defined CONFIG_MBROADWELL
+#define MODULE_PROC_FAMILY "BROADWELL "
+#elif defined CONFIG_MSKYLAKE
+#define MODULE_PROC_FAMILY "SKYLAKE "
+#elif defined CONFIG_MSKYLAKEX
+#define MODULE_PROC_FAMILY "SKYLAKEX "
+#elif defined CONFIG_MCANNONLAKE
+#define MODULE_PROC_FAMILY "CANNONLAKE "
+#elif defined CONFIG_MICELAKE_CLIENT
+#define MODULE_PROC_FAMILY "ICELAKE_CLIENT "
+#elif defined CONFIG_MICELAKE_SERVER
+#define MODULE_PROC_FAMILY "ICELAKE_SERVER "
+#elif defined CONFIG_MCASCADELAKE
+#define MODULE_PROC_FAMILY "CASCADELAKE "
+#elif defined CONFIG_MCOOPERLAKE
+#define MODULE_PROC_FAMILY "COOPERLAKE "
+#elif defined CONFIG_MTIGERLAKE
+#define MODULE_PROC_FAMILY "TIGERLAKE "
+#elif defined CONFIG_MSAPPHIRERAPIDS
+#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS "
+#elif defined CONFIG_MROCKETLAKE
+#define MODULE_PROC_FAMILY "ROCKETLAKE "
+#elif defined CONFIG_MALDERLAKE
+#define MODULE_PROC_FAMILY "ALDERLAKE "
+#elif defined CONFIG_MRAPTORLAKE
+#define MODULE_PROC_FAMILY "RAPTORLAKE "
+#elif defined CONFIG_MMETEORLAKE
+#define MODULE_PROC_FAMILY "METEORLAKE "
+#elif defined CONFIG_MEMERALDRAPIDS
+#define MODULE_PROC_FAMILY "EMERALDRAPIDS "
 #elif defined CONFIG_MATOM
 #define MODULE_PROC_FAMILY "ATOM "
 #elif defined CONFIG_M686
@@ -35,6 +85,28 @@
 #define MODULE_PROC_FAMILY "K7 "
 #elif defined CONFIG_MK8
 #define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MK8SSE3
+#define MODULE_PROC_FAMILY "K8SSE3 "
+#elif defined CONFIG_MK10
+#define MODULE_PROC_FAMILY "K10 "
+#elif defined CONFIG_MBARCELONA
+#define MODULE_PROC_FAMILY "BARCELONA "
+#elif defined CONFIG_MBOBCAT
+#define MODULE_PROC_FAMILY "BOBCAT "
+#elif defined CONFIG_MBULLDOZER
+#define MODULE_PROC_FAMILY "BULLDOZER "
+#elif defined CONFIG_MPILEDRIVER
+#define MODULE_PROC_FAMILY "PILEDRIVER "
+#elif defined CONFIG_MSTEAMROLLER
+#define MODULE_PROC_FAMILY "STEAMROLLER "
+#elif defined CONFIG_MJAGUAR
+#define MODULE_PROC_FAMILY "JAGUAR "
+#elif defined CONFIG_MEXCAVATOR
+#define MODULE_PROC_FAMILY "EXCAVATOR "
+#elif defined CONFIG_MZEN
+#define MODULE_PROC_FAMILY "ZEN "
+#elif defined CONFIG_MZEN2
+#define MODULE_PROC_FAMILY "ZEN2 "
 #elif defined CONFIG_MELAN
 #define MODULE_PROC_FAMILY "ELAN "
 #elif defined CONFIG_MCRUSOE
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index ddb798603201..7c20387d8202 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void)
 		return 0;
 }
 
-#if IS_ENABLED(CONFIG_VMD)
 struct pci_dev *pci_real_dma_dev(struct pci_dev *dev)
 {
+#if IS_ENABLED(CONFIG_VMD)
 	if (is_vmd(dev->bus))
 		return to_pci_sysdata(dev->bus)->vmd_dev;
+#endif
+
+	if (is_nvme_remap(dev->bus))
+		return to_pci_sysdata(dev->bus)->nvme_remap_dev;
 
 	return dev;
 }
-#endif
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 27f11320b8d1..dd9af8cd7bca 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -16,6 +16,15 @@ config MQ_IOSCHED_KYBER
 	  synchronous writes, it will self-tune queue depths to achieve that
 	  goal.
 
+config MQ_IOSCHED_ADIOS
+	tristate "Adaptive Deadline I/O scheduler"
+	default y
+	help
+	  ADIOS is a multi-queue I/O scheduler for the Linux kernel, based on
+	  mq-deadline and Kyber, with learning-based adaptive latency control.
+	  It aims to provide low latency for synchronous requests while
+	  maintaining high throughput for asynchronous requests and bulk I/O.
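+
+	  If enabled, the scheduler can typically be selected for a block
+	  device at runtime via sysfs, for example:
+
+	    echo adios > /sys/block/<device>/queue/scheduler
+
+	  (This assumes the scheduler registers under the elevator name
+	  "adios".)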
+
 config IOSCHED_BFQ
 	tristate "BFQ I/O scheduler"
 	select BLK_ICQ
diff --git a/block/Makefile b/block/Makefile
index ddfd21c1a9ff..ebe14e94d123 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_BLK_CGROUP_IOLATENCY)	+= blk-iolatency.o
 obj-$(CONFIG_BLK_CGROUP_IOCOST)	+= blk-iocost.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
 obj-$(CONFIG_MQ_IOSCHED_KYBER)	+= kyber-iosched.o
+obj-$(CONFIG_MQ_IOSCHED_ADIOS)	+= adios.o
 bfq-y				:= bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
 obj-$(CONFIG_IOSCHED_BFQ)	+= bfq.o
 
@@ -38,3 +39,10 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= blk-crypto.o blk-crypto-profile.o \
 					   blk-crypto-sysfs.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)	+= blk-crypto-fallback.o
 obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)	+= holder.o
+
+all:
+	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
+
+clean:
+	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
+
diff --git a/block/adios.c b/block/adios.c
new file mode 100644
index 000000000000..301beb812ebb
--- /dev/null
+++ b/block/adios.c
@@ -0,0 +1,1352 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * The Adaptive Deadline I/O Scheduler (ADIOS)
+ * Based on mq-deadline and Kyber,
+ * with learning-based adaptive latency control
+ *
+ * Copyright (C) 2025 Masahito Suzuki
+ */
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/math.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/sbitmap.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+
+#include "elevator.h"
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+
+#define ADIOS_VERSION "1.5.3"
+
+// Define operation types supported by ADIOS
+enum adios_op_type {
+	ADIOS_READ    = 0,
+	ADIOS_WRITE   = 1,
+	ADIOS_DISCARD = 2,
+	ADIOS_OTHER   = 3,
+	ADIOS_OPTYPES = 4,
+};
+
+// Global variable to control the latency
+static u64 default_global_latency_window = 16000000ULL;
+// Ratio below which batch queues should be refilled
+static u8  default_bq_refill_below_ratio = 15;
+
+// Dynamic thresholds for shrinkage
+static u32 default_lm_shrink_at_kreqs  = 10000;
+static u32 default_lm_shrink_at_gbytes =   100;
+static u32 default_lm_shrink_resist    =     2;
+
+// Latency targets for each operation type
+static u64 default_latency_target[ADIOS_OPTYPES] = {
+	[ADIOS_READ]    =     1ULL * NSEC_PER_MSEC,
+	[ADIOS_WRITE]   =  2000ULL * NSEC_PER_MSEC,
+	[ADIOS_DISCARD] =  8000ULL * NSEC_PER_MSEC,
+	[ADIOS_OTHER]   =     0ULL * NSEC_PER_MSEC,
+};
+
+// Maximum batch size limits for each operation type
+static u32 default_batch_limit[ADIOS_OPTYPES] = {
+	[ADIOS_READ]    = 24,
+	[ADIOS_WRITE]   = 48,
+	[ADIOS_DISCARD] =  1,
+	[ADIOS_OTHER]   =  1,
+};
+
+static u32 default_dl_prio[2] = {
+	[0] = 7,
+	[1] = 0,
+};
+
+// Thresholds for latency model control
+#define LM_BLOCK_SIZE_THRESHOLD 4096
+#define LM_SAMPLES_THRESHOLD    1024
+#define LM_INTERVAL_THRESHOLD   1500
+#define LM_OUTLIER_PERCENTILE     99
+#define LM_LAT_BUCKET_COUNT       64
+
+// Structure to hold latency bucket data for small requests
+struct latency_bucket_small {
+	u64 sum_latency;
+	u32 count;
+};
+
+// Structure to hold latency bucket data for large requests
+struct latency_bucket_large {
+	u64 sum_latency;
+	u64 sum_block_size;
+	u32 count;
+};
+
+// Structure to hold the latency model context data
+struct latency_model {
+	spinlock_t lock;
+	u64 base;
+	u64 slope;
+	u64 small_sum_delay;
+	u64 small_count;
+	u64 large_sum_delay;
+	u64 large_sum_bsize;
+	u64 last_update_jiffies;
+
+	spinlock_t buckets_lock;
+	struct latency_bucket_small small_bucket[LM_LAT_BUCKET_COUNT];
+	struct latency_bucket_large large_bucket[LM_LAT_BUCKET_COUNT];
+
+	u32 lm_shrink_at_kreqs;
+	u32 lm_shrink_at_gbytes;
+	u8  lm_shrink_resist;
+};
+
+#define ADIOS_BQ_PAGES 2
+
+// Adios scheduler data
+struct adios_data {
+	spinlock_t pq_lock;
+	struct list_head prio_queue;
+
+	struct rb_root_cached dl_tree[2];
+	spinlock_t lock;
+	u8  dl_queued;
+	s64 dl_bias;
+	s32 dl_prio[2];
+
+	u64 global_latency_window;
+	u64 latency_target[ADIOS_OPTYPES];
+	u32 batch_limit[ADIOS_OPTYPES];
+	u32 batch_actual_max_size[ADIOS_OPTYPES];
+	u32 batch_actual_max_total;
+	u32 async_depth;
+	u8  bq_refill_below_ratio;
+
+	u8 bq_page;
+	bool more_bq_ready;
+	struct list_head batch_queue[ADIOS_BQ_PAGES][ADIOS_OPTYPES];
+	u32 batch_count[ADIOS_BQ_PAGES][ADIOS_OPTYPES];
+	spinlock_t bq_lock;
+
+	struct latency_model latency_model[ADIOS_OPTYPES];
+	struct timer_list update_timer;
+
+	atomic64_t total_pred_lat;
+
+	struct kmem_cache *rq_data_pool;
+	struct kmem_cache *dl_group_pool;
+};
+
+// List of requests with the same deadline in the deadline-sorted tree
+struct dl_group {
+	struct rb_node node;
+	struct list_head rqs;
+	u64 deadline;
+} __attribute__((aligned(64)));
+
+// Structure to hold scheduler-specific data for each request
+struct adios_rq_data {
+	struct list_head *dl_group;
+	struct list_head dl_node;
+
+	struct request *rq;
+	u64 deadline;
+	u64 pred_lat;
+	u32 block_size;
+} __attribute__((aligned(64)));
+
+static const int adios_prio_to_weight[40] = {
+ /* -20 */     88761,     71755,     56483,     46273,     36291,
+ /* -15 */     29154,     23254,     18705,     14949,     11916,
+ /* -10 */      9548,      7620,      6100,      4904,      3906,
+ /*  -5 */      3121,      2501,      1991,      1586,      1277,
+ /*   0 */      1024,       820,       655,       526,       423,
+ /*   5 */       335,       272,       215,       172,       137,
+ /*  10 */       110,        87,        70,        56,        45,
+ /*  15 */        36,        29,        23,        18,        15,
+};
+
+// Count the number of entries in small buckets
+static u32 lm_count_small_entries(struct latency_model *model) {
+	u32 total_count = 0;
+	for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++)
+		total_count += model->small_bucket[i].count;
+	return total_count;
+}
+
+// Update the small buckets in the latency model
+static bool lm_update_small_buckets(struct latency_model *model,
+		u32 total_count, bool count_all) {
+	u64 sum_latency = 0;
+	u32 sum_count = 0;
+	u32 cumulative_count = 0, threshold_count = 0;
+	u8  outlier_threshold_bucket = 0;
+	u8  outlier_percentile = LM_OUTLIER_PERCENTILE;
+	u8  reduction;
+
+	if (count_all)
+		outlier_percentile = 100;
+
+	// Calculate the threshold count for outlier detection
+	threshold_count = (total_count * outlier_percentile) / 100;
+
+	// Identify the bucket that corresponds to the outlier threshold
+	for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) {
+		cumulative_count += model->small_bucket[i].count;
+		if (cumulative_count >= threshold_count) {
+			outlier_threshold_bucket = i;
+			break;
+		}
+	}
+
+	// Calculate the average latency, excluding outliers
+	for (u8 i = 0; i <= outlier_threshold_bucket; i++) {
+		struct latency_bucket_small *bucket = &model->small_bucket[i];
+		if (i < outlier_threshold_bucket) {
+			sum_latency += bucket->sum_latency;
+			sum_count += bucket->count;
+		} else {
+			// The threshold bucket's contribution is proportional
+			u64 remaining_count =
+				threshold_count - (cumulative_count - bucket->count);
+			if (bucket->count > 0) {
+				sum_latency +=
+					(bucket->sum_latency * remaining_count) / bucket->count;
+				sum_count += remaining_count;
+			}
+		}
+	}
+
+	// Shrink the model if it reaches the readjustment threshold
+	if (model->small_count >= 1000ULL * model->lm_shrink_at_kreqs) {
+		reduction = model->lm_shrink_resist;
+		if (model->small_count >> reduction) {
+			model->small_sum_delay -= model->small_sum_delay >> reduction;
+			model->small_count     -= model->small_count     >> reduction;
+		}
+	}
+
+	// Accumulate the average latency into the statistics
+	model->small_sum_delay += sum_latency;
+	model->small_count += sum_count;
+
+	// Reset small bucket information
+	memset(model->small_bucket, 0,
+		sizeof(model->small_bucket[0]) * LM_LAT_BUCKET_COUNT);
+
+	return true;
+}
+
+// Count the number of entries in large buckets
+static u32 lm_count_large_entries(struct latency_model *model) {
+	u32 total_count = 0;
+	for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++)
+		total_count += model->large_bucket[i].count;
+	return total_count;
+}
+
+// Update the large buckets in the latency model
+static bool lm_update_large_buckets(
+		struct latency_model *model,
+		u32 total_count, bool count_all) {
+	s64 sum_latency = 0;
+	u64 sum_block_size = 0, intercept;
+	u32 cumulative_count = 0, threshold_count = 0;
+	u8  outlier_threshold_bucket = 0;
+	u8  outlier_percentile = LM_OUTLIER_PERCENTILE;
+	u8  reduction;
+
+	if (count_all)
+		outlier_percentile = 100;
+
+	// Calculate the threshold count for outlier detection
+	threshold_count = (total_count * outlier_percentile) / 100;
+
+	// Identify the bucket that corresponds to the outlier threshold
+	for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) {
+		cumulative_count += model->large_bucket[i].count;
+		if (cumulative_count >= threshold_count) {
+			outlier_threshold_bucket = i;
+			break;
+		}
+	}
+
+	// Calculate the average latency and block size, excluding outliers
+	for (u8 i = 0; i <= outlier_threshold_bucket; i++) {
+		struct latency_bucket_large *bucket = &model->large_bucket[i];
+		if (i < outlier_threshold_bucket) {
+			sum_latency += bucket->sum_latency;
+			sum_block_size += bucket->sum_block_size;
+		} else {
+			// The threshold bucket's contribution is proportional
+			u64 remaining_count =
+				threshold_count - (cumulative_count - bucket->count);
+			if (bucket->count > 0) {
+				sum_latency +=
+					(bucket->sum_latency * remaining_count) / bucket->count;
+				sum_block_size +=
+					(bucket->sum_block_size * remaining_count) / bucket->count;
+			}
+		}
+	}
+
+	// Shrink the model if it reaches the readjustment threshold
+	if (model->large_sum_bsize >= 0x40000000ULL * model->lm_shrink_at_gbytes) {
+		reduction = model->lm_shrink_resist;
+		if (model->large_sum_bsize >> reduction) {
+			model->large_sum_delay -= model->large_sum_delay >> reduction;
+			model->large_sum_bsize -= model->large_sum_bsize >> reduction;
+		}
+	}
+
+	// Accumulate the average delay into the statistics
+	intercept = model->base * threshold_count;
+	if (sum_latency > intercept)
+		sum_latency -= intercept;
+
+	model->large_sum_delay += sum_latency;
+	model->large_sum_bsize += sum_block_size;
+
+	// Reset large bucket information
+	memset(model->large_bucket, 0,
+		sizeof(model->large_bucket[0]) * LM_LAT_BUCKET_COUNT);
+
+	return true;
+}
+
+// Update the latency model parameters and statistics
+static void latency_model_update(struct latency_model *model) {
+	unsigned long flags;
+	u64 now;
+	u32 small_count, large_count;
+	bool time_elapsed;
+	bool small_processed = false, large_processed = false;
+
+	guard(spinlock_irqsave)(&model->lock);
+
+	spin_lock_irqsave(&model->buckets_lock, flags);
+
+	// Whether enough time has elapsed since the last update
+	now = jiffies;
+	time_elapsed = unlikely(!model->base) || model->last_update_jiffies +
+		msecs_to_jiffies(LM_INTERVAL_THRESHOLD) <= now;
+
+	// Count the number of entries in buckets
+	small_count = lm_count_small_entries(model);
+	large_count = lm_count_large_entries(model);
+
+	// Update small buckets
+	if (small_count && (time_elapsed ||
+			LM_SAMPLES_THRESHOLD <= small_count || !model->base))
+		small_processed = lm_update_small_buckets(
+			model, small_count, !model->base);
+	// Update large buckets
+	if (large_count && (time_elapsed ||
+			LM_SAMPLES_THRESHOLD <= large_count || !model->slope))
+		large_processed = lm_update_large_buckets(
+			model, large_count, !model->slope);
+
+	spin_unlock_irqrestore(&model->buckets_lock, flags);
+
+	// Update the base parameter if small bucket was processed
+	if (small_processed && likely(model->small_count))
+		model->base = div_u64(model->small_sum_delay, model->small_count);
+
+	// Update the slope parameter if large bucket was processed
+	if (large_processed && likely(model->large_sum_bsize))
+		model->slope = div_u64(model->large_sum_delay,
+			DIV_ROUND_UP_ULL(model->large_sum_bsize, 1024));
+
+	// Reset statistics and update last updated jiffies if time has elapsed
+	if (time_elapsed)
+		model->last_update_jiffies = now;
+}
+
+// Determine the bucket index for a given measured and predicted latency
+static u8 lm_input_bucket_index(
+		struct latency_model *model, u64 measured, u64 predicted) {
+	u8 bucket_index;
+
+	if (measured < predicted * 2)
+		bucket_index = (measured * 20) / predicted;
+	else if (measured < predicted * 5)
+		bucket_index = (measured * 10) / predicted + 20;
+	else
+		bucket_index = (measured * 3) / predicted + 40;
+
+	return bucket_index;
+}
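+
+/*
+ * Worked example for the mapping above (illustrative numbers only): with a
+ * predicted latency of 1 ms, a measured latency of 1.5 ms falls in the first
+ * range and maps to bucket 30, 3 ms falls in the second range and maps to
+ * bucket 50, and 10 ms falls in the last range and maps to 70, which the
+ * callers clamp to LM_LAT_BUCKET_COUNT - 1.
+ */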
+
+// Input latency data into the latency model
+static void latency_model_input(struct latency_model *model,
+		u32 block_size, u64 latency, u64 pred_lat) {
+	unsigned long flags;
+	u8 bucket_index;
+
+	spin_lock_irqsave(&model->buckets_lock, flags);
+
+	if (block_size <= LM_BLOCK_SIZE_THRESHOLD) {
+		// Handle small requests
+
+		bucket_index =
+			lm_input_bucket_index(model, latency, (model->base ?: 1));
+
+		if (bucket_index >= LM_LAT_BUCKET_COUNT)
+			bucket_index = LM_LAT_BUCKET_COUNT - 1;
+
+		model->small_bucket[bucket_index].count++;
+		model->small_bucket[bucket_index].sum_latency += latency;
+
+		if (unlikely(!model->base)) {
+			spin_unlock_irqrestore(&model->buckets_lock, flags);
+			latency_model_update(model);
+			return;
+		}
+	} else {
+		// Handle large requests
+		if (!model->base || !pred_lat) {
+			spin_unlock_irqrestore(&model->buckets_lock, flags);
+			return;
+		}
+
+		bucket_index =
+			lm_input_bucket_index(model, latency, pred_lat);
+
+		if (bucket_index >= LM_LAT_BUCKET_COUNT)
+			bucket_index = LM_LAT_BUCKET_COUNT - 1;
+
+		model->large_bucket[bucket_index].count++;
+		model->large_bucket[bucket_index].sum_latency += latency;
+		model->large_bucket[bucket_index].sum_block_size += block_size;
+	}
+
+	spin_unlock_irqrestore(&model->buckets_lock, flags);
+}
+
+// Predict the latency for a given block size using the latency model
+static u64 latency_model_predict(struct latency_model *model, u32 block_size) {
+	u64 result;
+
+	guard(spinlock_irqsave)(&model->lock);
+	// Predict latency based on the model
+	result = model->base;
+	if (block_size > LM_BLOCK_SIZE_THRESHOLD)
+		result += model->slope *
+			DIV_ROUND_UP_ULL(block_size - LM_BLOCK_SIZE_THRESHOLD, 1024);
+
+	return result;
+}
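+
+/*
+ * Sketch of the prediction above with illustrative parameters: if base is
+ * 50,000 ns and slope is 2,000 ns per KiB, a 64 KiB request is predicted at
+ * 50,000 + 2,000 * DIV_ROUND_UP(65536 - 4096, 1024) = 50,000 + 2,000 * 60
+ * = 170,000 ns; requests of LM_BLOCK_SIZE_THRESHOLD bytes or less are
+ * predicted at base alone.
+ */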
+
+// Determine the type of operation based on request flags
+static u8 adios_optype(struct request *rq) {
+	blk_opf_t opf = rq->cmd_flags;
+	switch (opf & REQ_OP_MASK) {
+	case REQ_OP_READ:
+		return ADIOS_READ;
+	case REQ_OP_WRITE:
+		return ADIOS_WRITE;
+	case REQ_OP_DISCARD:
+		return ADIOS_DISCARD;
+	default:
+		return ADIOS_OTHER;
+	}
+}
+
+static inline u8 adios_optype_not_read(struct request *rq) {
+	return (rq->cmd_flags & REQ_OP_MASK) != REQ_OP_READ;
+}
+
+// Helper function to retrieve adios_rq_data from a request
+static inline struct adios_rq_data *get_rq_data(struct request *rq) {
+	return (struct adios_rq_data *)rq->elv.priv[0];
+}
+
+// Add a request to the deadline-sorted red-black tree
+static void add_to_dl_tree(
+		struct adios_data *ad, bool dl_idx, struct request *rq) {
+	struct rb_root_cached *root = &ad->dl_tree[dl_idx];
+	struct rb_node **link = &(root->rb_root.rb_node), *parent = NULL;
+	bool leftmost = true;
+	struct adios_rq_data *rd = get_rq_data(rq);
+	struct dl_group *dlg;
+
+	rd->block_size = blk_rq_bytes(rq);
+	u8 optype = adios_optype(rq);
+	rd->pred_lat =
+		latency_model_predict(&ad->latency_model[optype], rd->block_size);
+	rd->deadline =
+		rq->start_time_ns + ad->latency_target[optype] + rd->pred_lat;
+
+	while (*link) {
+		dlg = rb_entry(*link, struct dl_group, node);
+		s64 diff = rd->deadline - dlg->deadline;
+
+		parent = *link;
+		if (diff < 0) {
+			link = &((*link)->rb_left);
+		} else if (diff > 0) {
+			link = &((*link)->rb_right);
+			leftmost = false;
+		} else { // diff == 0
+			goto found;
+		}
+	}
+
+	dlg = rb_entry_safe(parent, struct dl_group, node);
+	if (!dlg || dlg->deadline != rd->deadline) {
+		dlg = kmem_cache_zalloc(ad->dl_group_pool, GFP_ATOMIC);
+		if (!dlg)
+			return;
+		dlg->deadline = rd->deadline;
+		INIT_LIST_HEAD(&dlg->rqs);
+		rb_link_node(&dlg->node, parent, link);
+		rb_insert_color_cached(&dlg->node, root, leftmost);
+	}
+found:
+	list_add_tail(&rd->dl_node, &dlg->rqs);
+	rd->dl_group = &dlg->rqs;
+	ad->dl_queued |= 1 << dl_idx;
+}
+
+// Remove a request from the deadline-sorted red-black tree
+static void del_from_dl_tree(
+		struct adios_data *ad, bool dl_idx, struct request *rq) {
+	struct rb_root_cached *root = &ad->dl_tree[dl_idx];
+	struct adios_rq_data *rd = get_rq_data(rq);
+	struct dl_group *dlg = container_of(rd->dl_group, struct dl_group, rqs);
+
+	list_del_init(&rd->dl_node);
+	if (list_empty(&dlg->rqs)) {
+		rb_erase_cached(&dlg->node, root);
+		kmem_cache_free(ad->dl_group_pool, dlg);
+	}
+	rd->dl_group = NULL;
+
+	if (RB_EMPTY_ROOT(&ad->dl_tree[dl_idx].rb_root))
+		ad->dl_queued &= ~(1 << dl_idx);
+}
+
+// Remove a request from the scheduler
+static void remove_request(struct adios_data *ad, struct request *rq) {
+	bool dl_idx = adios_optype_not_read(rq);
+	struct request_queue *q = rq->q;
+	struct adios_rq_data *rd = get_rq_data(rq);
+
+	list_del_init(&rq->queuelist);
+
+	// We might not be on the rbtree, if we are doing an insert merge
+	if (rd->dl_group)
+		del_from_dl_tree(ad, dl_idx, rq);
+
+	elv_rqhash_del(q, rq);
+	if (q->last_merge == rq)
+		q->last_merge = NULL;
+}
+
+// Convert a queue depth to the corresponding word depth for shallow allocation
+static int to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) {
+	struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
+	const unsigned int nrr = hctx->queue->nr_requests;
+
+	return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
+}
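+
+/*
+ * Example of the scaling above (illustrative numbers): with nr_requests = 256
+ * and an sbitmap word shift of 6 (64 bits per word), a queue depth of 48
+ * becomes ((48 << 6) + 255) / 256 = 12, the per-word shallow depth, i.e. the
+ * same 48/256 fraction applied within each 64-bit word.
+ */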
+
+// Limit the depth of request allocation for asynchronous and write requests
+static void adios_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) {
+	struct adios_data *ad = data->q->elevator->elevator_data;
+
+	// Do not throttle synchronous reads
+	if (op_is_sync(opf) && !op_is_write(opf))
+		return;
+
+	data->shallow_depth = to_word_depth(data->hctx, ad->async_depth);
+}
+
+// Update async_depth when the number of requests in the queue changes
+static void adios_depth_updated(struct blk_mq_hw_ctx *hctx) {
+	struct request_queue *q = hctx->queue;
+	struct adios_data *ad = q->elevator->elevator_data;
+	struct blk_mq_tags *tags = hctx->sched_tags;
+
+	ad->async_depth = q->nr_requests;
+
+	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
+}
+
+// Handle request merging after a merge operation
+static void adios_request_merged(struct request_queue *q, struct request *req,
+				  enum elv_merge type) {
+	bool dl_idx = adios_optype_not_read(req);
+	struct adios_data *ad = q->elevator->elevator_data;
+
+	// if the merge was a front merge, we need to reposition request
+	if (type == ELEVATOR_FRONT_MERGE) {
+		del_from_dl_tree(ad, dl_idx, req);
+		add_to_dl_tree(ad, dl_idx, req);
+	}
+}
+
+// Handle merging of requests after one has been merged into another
+static void adios_merged_requests(struct request_queue *q, struct request *req,
+				   struct request *next) {
+	struct adios_data *ad = q->elevator->elevator_data;
+
+	lockdep_assert_held(&ad->lock);
+
+	// kill knowledge of next, this one is a goner
+	remove_request(ad, next);
+}
+
+// Try to merge a bio into an existing rq before associating it with an rq
+static bool adios_bio_merge(struct request_queue *q, struct bio *bio,
+		unsigned int nr_segs) {
+	unsigned long flags;
+	struct adios_data *ad = q->elevator->elevator_data;
+	struct request *free = NULL;
+	bool ret;
+
+	spin_lock_irqsave(&ad->lock, flags);
+	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
+	spin_unlock_irqrestore(&ad->lock, flags);
+
+	if (free)
+		blk_mq_free_request(free);
+
+	return ret;
+}
+
+// Insert a request into the scheduler
+static void insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+				  blk_insert_t insert_flags, struct list_head *free) {
+	unsigned long flags;
+	bool dl_idx = adios_optype_not_read(rq);
+	struct request_queue *q = hctx->queue;
+	struct adios_data *ad = q->elevator->elevator_data;
+
+	lockdep_assert_held(&ad->lock);
+
+	if (insert_flags & BLK_MQ_INSERT_AT_HEAD) {
+		spin_lock_irqsave(&ad->pq_lock, flags);
+		list_add(&rq->queuelist, &ad->prio_queue);
+		spin_unlock_irqrestore(&ad->pq_lock, flags);
+		return;
+	}
+
+	if (blk_mq_sched_try_insert_merge(q, rq, free))
+		return;
+
+	add_to_dl_tree(ad, dl_idx, rq);
+
+	if (rq_mergeable(rq)) {
+		elv_rqhash_add(q, rq);
+		if (!q->last_merge)
+			q->last_merge = rq;
+	}
+}
+
+// Insert multiple requests into the scheduler
+static void adios_insert_requests(struct blk_mq_hw_ctx *hctx,
+				   struct list_head *list,
+				   blk_insert_t insert_flags) {
+	unsigned long flags;
+	struct request_queue *q = hctx->queue;
+	struct adios_data *ad = q->elevator->elevator_data;
+	LIST_HEAD(free);
+
+	spin_lock_irqsave(&ad->lock, flags);
+	while (!list_empty(list)) {
+		struct request *rq;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		insert_request(hctx, rq, insert_flags, &free);
+	}
+	spin_unlock_irqrestore(&ad->lock, flags);
+
+	blk_mq_free_requests(&free);
+}
+
+// Prepare a request before it is inserted into the scheduler
+static void adios_prepare_request(struct request *rq) {
+	struct adios_data *ad = rq->q->elevator->elevator_data;
+	struct adios_rq_data *rd;
+
+	rq->elv.priv[0] = NULL;
+
+	/* Allocate adios_rq_data from the memory pool */
+	rd = kmem_cache_zalloc(ad->rq_data_pool, GFP_ATOMIC);
+	if (WARN(!rd, "adios_prepare_request: "
+			"Failed to allocate memory from rq_data_pool. rd is NULL\n"))
+		return;
+
+	rd->rq = rq;
+	rq->elv.priv[0] = rd;
+}
+
+static struct adios_rq_data *get_dl_first_rd(struct adios_data *ad, bool idx) {
+	struct rb_root_cached *root = &ad->dl_tree[idx];
+	struct rb_node *first = rb_first_cached(root);
+	struct dl_group *dl_group = rb_entry(first, struct dl_group, node);
+	struct adios_rq_data *rd =
+		list_first_entry(&dl_group->rqs, struct adios_rq_data, dl_node);
+
+	return rd;
+}
+
+// Select the next request to dispatch from the deadline-sorted red-black tree
+static struct request *next_request(struct adios_data *ad) {
+	struct adios_rq_data *rd;
+	bool dl_idx, bias_idx, reduce_bias;
+
+	if (!ad->dl_queued)
+		return NULL;
+
+	dl_idx = ad->dl_queued >> 1;
+	rd = get_dl_first_rd(ad, dl_idx);
+
+	bias_idx = ad->dl_bias < 0;
+	reduce_bias = (bias_idx == dl_idx);
+
+	if (ad->dl_queued == 0x3) {
+		struct adios_rq_data *trd[2];
+		trd[0] = get_dl_first_rd(ad, 0);
+		trd[1] = rd;
+
+		rd = trd[bias_idx];
+
+		reduce_bias =
+			(trd[bias_idx]->deadline > trd[((u8)bias_idx + 1) % 2]->deadline);
+	}
+
+	if (reduce_bias) {
+		s64 sign = ((int)bias_idx << 1) - 1;
+		if (unlikely(!rd->pred_lat))
+			ad->dl_bias = sign;
+		else {
+			ad->dl_bias += sign * (s64)((rd->pred_lat *
+				adios_prio_to_weight[ad->dl_prio[bias_idx] + 20]) >> 10);
+		}
+	}
+
+	return rd->rq;
+}
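+
+/*
+ * Rough sketch of the selection above (details of dl_queued, dl_bias and
+ * adios_prio_to_weight[] are assumptions based on their definitions earlier
+ * in this file):
+ *  - dl_queued is a two-bit mask: bit 0 set when the read tree is non-empty,
+ *    bit 1 for the other-ops tree.
+ *  - When both trees are populated, the sign of dl_bias selects the tree
+ *    (negative favours index 1), and the bias is only charged when the
+ *    chosen request's deadline is already later than the head of the
+ *    opposite tree.
+ *  - The charge is the request's predicted latency scaled by the weight for
+ *    that tree's dl_prio, so the per-direction priorities control how
+ *    quickly dispatch alternates between reads and other operations.
+ */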
+
+// Reset the batch queue counts for a given page
+static void reset_batch_counts(struct adios_data *ad, u8 page) {
+	memset(&ad->batch_count[page], 0, sizeof(ad->batch_count[page]));
+}
+
+// Initialize all batch queues
+static void init_batch_queues(struct adios_data *ad) {
+	for (u8 page = 0; page < ADIOS_BQ_PAGES; page++) {
+		reset_batch_counts(ad, page);
+
+		for (u8 optype = 0; optype < ADIOS_OPTYPES; optype++)
+			INIT_LIST_HEAD(&ad->batch_queue[page][optype]);
+	}
+}
+
+// Fill the batch queues with requests from the deadline-sorted red-black tree
+static bool fill_batch_queues(struct adios_data *ad, u64 current_lat) {
+	unsigned long flags;
+	u32 count = 0;
+	u32 optype_count[ADIOS_OPTYPES];
+	memset(optype_count, 0, sizeof(optype_count));
+	u8 page = (ad->bq_page + 1) % ADIOS_BQ_PAGES;
+
+	reset_batch_counts(ad, page);
+
+	spin_lock_irqsave(&ad->lock, flags);
+	while (true) {
+		struct request *rq = next_request(ad);
+		if (!rq)
+			break;
+
+		struct adios_rq_data *rd = get_rq_data(rq);
+		u8 optype = adios_optype(rq);
+		current_lat += rd->pred_lat;
+
+		// Check batch size and total predicted latency
+		if (count && (!ad->latency_model[optype].base ||
+			ad->batch_count[page][optype] >= ad->batch_limit[optype] ||
+			current_lat > ad->global_latency_window)) {
+			break;
+		}
+
+		remove_request(ad, rq);
+
+		// Add request to the corresponding batch queue
+		list_add_tail(&rq->queuelist, &ad->batch_queue[page][optype]);
+		ad->batch_count[page][optype]++;
+		atomic64_add(rd->pred_lat, &ad->total_pred_lat);
+		optype_count[optype]++;
+		count++;
+	}
+	spin_unlock_irqrestore(&ad->lock, flags);
+
+	if (count) {
+		ad->more_bq_ready = true;
+		for (u8 optype = 0; optype < ADIOS_OPTYPES; optype++) {
+			if (ad->batch_actual_max_size[optype] < optype_count[optype])
+				ad->batch_actual_max_size[optype] = optype_count[optype];
+		}
+		if (ad->batch_actual_max_total < count)
+			ad->batch_actual_max_total = count;
+	}
+	return count;
+}
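+
+/*
+ * The fill loop above stops taking requests once at least one has been
+ * batched and the next candidate would either belong to an operation type
+ * with no base latency estimate yet, exceed that type's batch_limit for
+ * this page, or push the accumulated predicted latency beyond the global
+ * latency window.
+ */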
+
+// Flip to the next batch queue page
+static void flip_bq_page(struct adios_data *ad) {
+	ad->more_bq_ready = false;
+	ad->bq_page = (ad->bq_page + 1) % ADIOS_BQ_PAGES;
+}
+
+// Dispatch a request from the batch queues
+static struct request *dispatch_from_bq(struct adios_data *ad) {
+	struct request *rq = NULL;
+	u64 tpl;
+
+	guard(spinlock_irqsave)(&ad->bq_lock);
+
+	tpl = atomic64_read(&ad->total_pred_lat);
+
+	if (!ad->more_bq_ready && (!tpl ||
+			tpl < ad->global_latency_window * ad->bq_refill_below_ratio / 100))
+		fill_batch_queues(ad, tpl);
+
+again:
+	// Check if there are any requests in the batch queues
+	for (u8 i = 0; i < ADIOS_OPTYPES; i++) {
+		if (!list_empty(&ad->batch_queue[ad->bq_page][i])) {
+			rq = list_first_entry(&ad->batch_queue[ad->bq_page][i],
+									struct request, queuelist);
+			list_del_init(&rq->queuelist);
+			return rq;
+		}
+	}
+
+	// If there's more batch queue page available, flip to it and retry
+	if (ad->more_bq_ready) {
+		flip_bq_page(ad);
+		goto again;
+	}
+
+	return NULL;
+}
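+
+/*
+ * Example of the refill condition above (numbers purely illustrative):
+ * with global_latency_window = 16,000,000 ns and bq_refill_below_ratio = 25,
+ * a new batch page is prepared once the total predicted latency of batched,
+ * not-yet-completed requests drops below 4,000,000 ns, or immediately when
+ * nothing is outstanding.
+ */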
+
+// Dispatch a request from the priority queue
+static struct request *dispatch_from_pq(struct adios_data *ad) {
+	struct request *rq = NULL;
+
+	guard(spinlock_irqsave)(&ad->pq_lock);
+
+	if (!list_empty(&ad->prio_queue)) {
+		rq = list_first_entry(&ad->prio_queue, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+	}
+	return rq;
+}
+
+// Dispatch a request to the hardware queue
+static struct request *adios_dispatch_request(struct blk_mq_hw_ctx *hctx) {
+	struct adios_data *ad = hctx->queue->elevator->elevator_data;
+	struct request *rq;
+
+	rq = dispatch_from_pq(ad);
+	if (rq)
+		goto found;
+	rq = dispatch_from_bq(ad);
+	if (!rq)
+		return NULL;
+found:
+	rq->rq_flags |= RQF_STARTED;
+	return rq;
+}
+
+// Timer callback function to periodically update latency models
+static void update_timer_callback(struct timer_list *t) {
+	struct adios_data *ad = from_timer(ad, t, update_timer);
+
+	for (u8 optype = 0; optype < ADIOS_OPTYPES; optype++)
+		latency_model_update(&ad->latency_model[optype]);
+}
+
+// Handle the completion of a request
+static void adios_completed_request(struct request *rq, u64 now) {
+	struct adios_data *ad = rq->q->elevator->elevator_data;
+	struct adios_rq_data *rd = get_rq_data(rq);
+
+	atomic64_sub(rd->pred_lat, &ad->total_pred_lat);
+
+	if (!rq->io_start_time_ns || !rd->block_size)
+		return;
+	u64 latency = now - rq->io_start_time_ns;
+	u8 optype = adios_optype(rq);
+	latency_model_input(&ad->latency_model[optype],
+		rd->block_size, latency, rd->pred_lat);
+	timer_reduce(&ad->update_timer, jiffies + msecs_to_jiffies(100));
+}
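+
+/*
+ * On completion the measured service time (now - io_start_time_ns) is fed
+ * back into the per-optype latency model together with the block size and
+ * the earlier prediction, and the update timer is pulled forward to fire
+ * within ~100 ms so the model's base/slope are refreshed soon after fresh
+ * samples arrive.
+ */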
+
+// Clean up after a request is finished
+static void adios_finish_request(struct request *rq) {
+	struct adios_data *ad = rq->q->elevator->elevator_data;
+
+	if (rq->elv.priv[0]) {
+		// Free adios_rq_data back to the memory pool
+		kmem_cache_free(ad->rq_data_pool, get_rq_data(rq));
+		rq->elv.priv[0] = NULL;
+	}
+}
+
+static inline bool pq_has_work(struct adios_data *ad) {
+	guard(spinlock_irqsave)(&ad->pq_lock);
+	return !list_empty(&ad->prio_queue);
+}
+
+static inline bool bq_has_work(struct adios_data *ad) {
+	guard(spinlock_irqsave)(&ad->bq_lock);
+
+	for (u8 i = 0; i < ADIOS_OPTYPES; i++)
+		if (!list_empty(&ad->batch_queue[ad->bq_page][i]))
+			return true;
+
+	return ad->more_bq_ready;
+}
+
+static inline bool dl_tree_has_work(struct adios_data *ad) {
+	guard(spinlock_irqsave)(&ad->lock);
+	return ad->dl_queued;
+}
+
+// Check if there are any requests available for dispatch
+static bool adios_has_work(struct blk_mq_hw_ctx *hctx) {
+	struct adios_data *ad = hctx->queue->elevator->elevator_data;
+
+	return pq_has_work(ad) || bq_has_work(ad) || dl_tree_has_work(ad);
+}
+
+// Initialize the scheduler-specific data for a hardware queue
+static int adios_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) {
+	adios_depth_updated(hctx);
+	return 0;
+}
+
+// Initialize the scheduler-specific data when initializing the request queue
+static int adios_init_sched(struct request_queue *q, struct elevator_type *e) {
+	struct adios_data *ad;
+	struct elevator_queue *eq;
+	int ret = -ENOMEM;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return ret;
+
+	ad = kzalloc_node(sizeof(*ad), GFP_KERNEL, q->node);
+	if (!ad)
+		goto put_eq;
+
+	// Create a memory pool for adios_rq_data
+	ad->rq_data_pool = kmem_cache_create("rq_data_pool",
+						sizeof(struct adios_rq_data),
+						0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ad->rq_data_pool) {
+		pr_err("adios: Failed to create rq_data_pool\n");
+		goto free_ad;
+	}
+
+	/* Create a memory pool for dl_group */
+	ad->dl_group_pool = kmem_cache_create("dl_group_pool",
+						sizeof(struct dl_group),
+						0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ad->dl_group_pool) {
+		pr_err("adios: Failed to create dl_group_pool\n");
+		goto destroy_rq_data_pool;
+	}
+
+	eq->elevator_data = ad;
+
+	ad->global_latency_window = default_global_latency_window;
+	ad->bq_refill_below_ratio = default_bq_refill_below_ratio;
+
+	INIT_LIST_HEAD(&ad->prio_queue);
+	for (u8 i = 0; i < 2; i++)
+		ad->dl_tree[i] = RB_ROOT_CACHED;
+	ad->dl_bias = 0;
+	ad->dl_queued = 0x0;
+	for (u8 i = 0; i < 2; i++)
+		ad->dl_prio[i] = default_dl_prio[i];
+
+	for (u8 i = 0; i < ADIOS_OPTYPES; i++) {
+		struct latency_model *model = &ad->latency_model[i];
+		spin_lock_init(&model->lock);
+		spin_lock_init(&model->buckets_lock);
+		memset(model->small_bucket, 0,
+			sizeof(model->small_bucket[0]) * LM_LAT_BUCKET_COUNT);
+		memset(model->large_bucket, 0,
+			sizeof(model->large_bucket[0]) * LM_LAT_BUCKET_COUNT);
+		model->last_update_jiffies = jiffies;
+		model->lm_shrink_at_kreqs  = default_lm_shrink_at_kreqs;
+		model->lm_shrink_at_gbytes = default_lm_shrink_at_gbytes;
+		model->lm_shrink_resist    = default_lm_shrink_resist;
+
+		ad->latency_target[i] = default_latency_target[i];
+		ad->batch_limit[i] = default_batch_limit[i];
+	}
+	timer_setup(&ad->update_timer, update_timer_callback, 0);
+	init_batch_queues(ad);
+
+	spin_lock_init(&ad->lock);
+	spin_lock_init(&ad->pq_lock);
+	spin_lock_init(&ad->bq_lock);
+
+	/* We dispatch from request queue wide instead of hw queue */
+	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
+
+	q->elevator = eq;
+	return 0;
+
+destroy_rq_data_pool:
+	kmem_cache_destroy(ad->rq_data_pool);
+free_ad:
+	kfree(ad);
+put_eq:
+	kobject_put(&eq->kobj);
+	return ret;
+}
+
+// Clean up and free resources when exiting the scheduler
+static void adios_exit_sched(struct elevator_queue *e) {
+	struct adios_data *ad = e->elevator_data;
+
+	timer_shutdown_sync(&ad->update_timer);
+
+	WARN_ON_ONCE(!list_empty(&ad->prio_queue));
+
+	if (ad->rq_data_pool)
+		kmem_cache_destroy(ad->rq_data_pool);
+
+	if (ad->dl_group_pool)
+		kmem_cache_destroy(ad->dl_group_pool);
+
+	kfree(ad);
+}
+
+// Define per-optype sysfs attributes: latency model (show only), latency target, and batch limit
+#define SYSFS_OPTYPE_DECL(name, optype)					\
+static ssize_t adios_lat_model_##name##_show(				\
+		struct elevator_queue *e, char *page) {				\
+	struct adios_data *ad = e->elevator_data;				\
+	struct latency_model *model = &ad->latency_model[optype];		\
+	ssize_t len = 0;						\
+	guard(spinlock_irqsave)(&model->lock);				\
+	len += sprintf(page,       "base : %llu ns\n", model->base);	\
+	len += sprintf(page + len, "slope: %llu ns/KiB\n", model->slope);\
+	return len;							\
+}									\
+static ssize_t adios_lat_target_##name##_store(				\
+		struct elevator_queue *e, const char *page, size_t count) {	\
+	struct adios_data *ad = e->elevator_data;				\
+	unsigned long nsec;						\
+	int ret;							\
+	ret = kstrtoul(page, 10, &nsec);					\
+	if (ret)							\
+		return ret;						\
+	ad->latency_model[optype].base = 0ULL;				\
+	ad->latency_target[optype] = nsec;				\
+	return count;							\
+}									\
+static ssize_t adios_lat_target_##name##_show(				\
+		struct elevator_queue *e, char *page) {				\
+	struct adios_data *ad = e->elevator_data;				\
+	return sprintf(page, "%llu\n", ad->latency_target[optype]);	\
+}									\
+static ssize_t adios_batch_limit_##name##_store(			\
+		struct elevator_queue *e, const char *page, size_t count) {	\
+	unsigned long max_batch;					\
+	int ret;							\
+	ret = kstrtoul(page, 10, &max_batch);				\
+	if (ret || max_batch == 0)					\
+		return -EINVAL;						\
+	struct adios_data *ad = e->elevator_data;				\
+	ad->batch_limit[optype] = max_batch;				\
+	return count;							\
+}									\
+static ssize_t adios_batch_limit_##name##_show(				\
+		struct elevator_queue *e, char *page) {				\
+	struct adios_data *ad = e->elevator_data;				\
+	return sprintf(page, "%u\n", ad->batch_limit[optype]);		\
+}
+
+SYSFS_OPTYPE_DECL(read, ADIOS_READ);
+SYSFS_OPTYPE_DECL(write, ADIOS_WRITE);
+SYSFS_OPTYPE_DECL(discard, ADIOS_DISCARD);
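+
+/*
+ * The three invocations above generate per-optype sysfs files such as
+ * lat_model_read, lat_target_read and batch_limit_read.  Assuming the
+ * standard elevator sysfs layout they can be inspected and tuned with e.g.
+ *
+ *   cat /sys/block/<dev>/queue/iosched/lat_model_read
+ *   echo 2000000 > /sys/block/<dev>/queue/iosched/lat_target_read
+ *
+ * where the 2 ms value is only an example; targets are parsed as plain
+ * nanosecond counts by the _store handlers above.
+ */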
+
+// Show the maximum batch size actually achieved for each operation type
+static ssize_t adios_batch_actual_max_show(
+		struct elevator_queue *e, char *page) {
+	struct adios_data *ad = e->elevator_data;
+	u32 total_count, read_count, write_count, discard_count;
+
+	total_count = ad->batch_actual_max_total;
+	read_count = ad->batch_actual_max_size[ADIOS_READ];
+	write_count = ad->batch_actual_max_size[ADIOS_WRITE];
+	discard_count = ad->batch_actual_max_size[ADIOS_DISCARD];
+
+	return sprintf(page,
+		"Total  : %u\nDiscard: %u\nRead   : %u\nWrite  : %u\n",
+		total_count, discard_count, read_count, write_count);
+}
+
+// Set the global latency window
+static ssize_t adios_global_latency_window_store(
+		struct elevator_queue *e, const char *page, size_t count) {
+	struct adios_data *ad = e->elevator_data;
+	unsigned long nsec;
+	int ret;
+
+	ret = kstrtoul(page, 10, &nsec);
+	if (ret)
+		return ret;
+
+	ad->global_latency_window = nsec;
+
+	return count;
+}
+
+// Show the global latency window
+static ssize_t adios_global_latency_window_show(
+		struct elevator_queue *e, char *page) {
+	struct adios_data *ad = e->elevator_data;
+	return sprintf(page, "%llu\n", ad->global_latency_window);
+}
+
+// Show the bq_refill_below_ratio
+static ssize_t adios_bq_refill_below_ratio_show(
+		struct elevator_queue *e, char *page) {
+	struct adios_data *ad = e->elevator_data;
+	return sprintf(page, "%d\n", ad->bq_refill_below_ratio);
+}
+
+// Set the bq_refill_below_ratio
+static ssize_t adios_bq_refill_below_ratio_store(
+		struct elevator_queue *e, const char *page, size_t count) {
+	struct adios_data *ad = e->elevator_data;
+	int ratio;
+	int ret;
+
+	ret = kstrtoint(page, 10, &ratio);
+	if (ret || ratio < 0 || ratio > 100)
+		return -EINVAL;
+
+	ad->bq_refill_below_ratio = ratio;
+
+	return count;
+}
+
+// Show the read priority
+static ssize_t adios_read_priority_show(
+		struct elevator_queue *e, char *page) {
+	struct adios_data *ad = e->elevator_data;
+	return sprintf(page, "%d\n", ad->dl_prio[0]);
+}
+
+// Set the read priority
+static ssize_t adios_read_priority_store(
+		struct elevator_queue *e, const char *page, size_t count) {
+	struct adios_data *ad = e->elevator_data;
+	int prio;
+	int ret;
+
+	ret = kstrtoint(page, 10, &prio);
+	if (ret || prio < -20 || prio > 19)
+		return -EINVAL;
+
+	guard(spinlock_irqsave)(&ad->lock);
+	ad->dl_prio[0] = prio;
+	ad->dl_bias = 0;
+
+	return count;
+}
+
+// Reset batch queue statistics
+static ssize_t adios_reset_bq_stats_store(
+		struct elevator_queue *e, const char *page, size_t count) {
+	struct adios_data *ad = e->elevator_data;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(page, 10, &val);
+	if (ret || val != 1)
+		return -EINVAL;
+
+	for (u8 i = 0; i < ADIOS_OPTYPES; i++)
+		ad->batch_actual_max_size[i] = 0;
+
+	ad->batch_actual_max_total = 0;
+
+	return count;
+}
+
+// Reset the latency model parameters
+static ssize_t adios_reset_lat_model_store(
+		struct elevator_queue *e, const char *page, size_t count) {
+	struct adios_data *ad = e->elevator_data;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(page, 10, &val);
+	if (ret || val != 1)
+		return -EINVAL;
+
+	for (u8 i = 0; i < ADIOS_OPTYPES; i++) {
+		struct latency_model *model = &ad->latency_model[i];
+		unsigned long flags;
+		spin_lock_irqsave(&model->lock, flags);
+		model->base = 0ULL;
+		model->slope = 0ULL;
+		model->small_sum_delay = 0ULL;
+		model->small_count = 0ULL;
+		model->large_sum_delay = 0ULL;
+		model->large_sum_bsize = 0ULL;
+		spin_unlock_irqrestore(&model->lock, flags);
+	}
+
+	return count;
+}
+
+// Show the ADIOS version
+static ssize_t adios_version_show(struct elevator_queue *e, char *page) {
+	return sprintf(page, "%s\n", ADIOS_VERSION);
+}
+
+// Define sysfs attributes for the latency model shrink thresholds
+#define SHRINK_THRESHOLD_ATTR_RW(name, model_field, min_value, max_value) \
+static ssize_t adios_shrink_##name##_store( \
+		struct elevator_queue *e, const char *page, size_t count) { \
+	struct adios_data *ad = e->elevator_data; \
+	unsigned long val; \
+	int ret; \
+	ret = kstrtoul(page, 10, &val); \
+	if (ret || val < min_value || val > max_value) \
+		return -EINVAL; \
+	for (u8 i = 0; i < ADIOS_OPTYPES; i++) { \
+		struct latency_model *model = &ad->latency_model[i]; \
+		unsigned long flags; \
+		spin_lock_irqsave(&model->lock, flags); \
+		model->model_field = val; \
+		spin_unlock_irqrestore(&model->lock, flags); \
+	} \
+	return count; \
+} \
+static ssize_t adios_shrink_##name##_show( \
+		struct elevator_queue *e, char *page) { \
+	struct adios_data *ad = e->elevator_data; \
+	u32 val = 0; \
+	for (u8 i = 0; i < ADIOS_OPTYPES; i++) { \
+		struct latency_model *model = &ad->latency_model[i]; \
+		unsigned long flags; \
+		spin_lock_irqsave(&model->lock, flags); \
+		val = model->model_field; \
+		spin_unlock_irqrestore(&model->lock, flags); \
+	} \
+	return sprintf(page, "%u\n", val); \
+}
+
+SHRINK_THRESHOLD_ATTR_RW(at_kreqs,  lm_shrink_at_kreqs,  1, 100000)
+SHRINK_THRESHOLD_ATTR_RW(at_gbytes, lm_shrink_at_gbytes, 1,   1000)
+SHRINK_THRESHOLD_ATTR_RW(resist,    lm_shrink_resist,    1,      3)
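+
+/*
+ * The generated shrink_at_kreqs / shrink_at_gbytes / shrink_resist files
+ * apply a single value to every operation type's latency model; the _show
+ * side reports the value from the last model visited, which matches as
+ * long as the _store side keeps them all in sync.
+ */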
+
+// Define sysfs attributes
+#define AD_ATTR(name, show_func, store_func) \
+	__ATTR(name, 0644, show_func, store_func)
+#define AD_ATTR_RW(name) \
+	__ATTR(name, 0644, adios_##name##_show, adios_##name##_store)
+#define AD_ATTR_RO(name) \
+	__ATTR(name, 0444, adios_##name##_show, NULL)
+#define AD_ATTR_WO(name) \
+	__ATTR(name, 0200, NULL, adios_##name##_store)
+
+// Define sysfs attributes for ADIOS scheduler
+static struct elv_fs_entry adios_sched_attrs[] = {
+	AD_ATTR_RO(batch_actual_max),
+	AD_ATTR_RW(bq_refill_below_ratio),
+	AD_ATTR_RW(global_latency_window),
+
+	AD_ATTR_RW(batch_limit_read),
+	AD_ATTR_RW(batch_limit_write),
+	AD_ATTR_RW(batch_limit_discard),
+
+	AD_ATTR_RO(lat_model_read),
+	AD_ATTR_RO(lat_model_write),
+	AD_ATTR_RO(lat_model_discard),
+
+	AD_ATTR_RW(lat_target_read),
+	AD_ATTR_RW(lat_target_write),
+	AD_ATTR_RW(lat_target_discard),
+
+	AD_ATTR_RW(shrink_at_kreqs),
+	AD_ATTR_RW(shrink_at_gbytes),
+	AD_ATTR_RW(shrink_resist),
+
+	AD_ATTR_RW(read_priority),
+
+	AD_ATTR_WO(reset_bq_stats),
+	AD_ATTR_WO(reset_lat_model),
+	AD_ATTR(adios_version, adios_version_show, NULL),
+
+	__ATTR_NULL
+};
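+
+/*
+ * Runtime selection follows the usual blk-mq mechanism (device name purely
+ * illustrative):
+ *
+ *   echo adios > /sys/block/nvme0n1/queue/scheduler
+ *
+ * after which the attributes above appear under
+ * /sys/block/nvme0n1/queue/iosched/.
+ */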
+
+// Define the ADIOS scheduler type
+static struct elevator_type mq_adios = {
+	.ops = {
+		.next_request		= elv_rb_latter_request,
+		.former_request		= elv_rb_former_request,
+		.limit_depth		= adios_limit_depth,
+		.depth_updated		= adios_depth_updated,
+		.request_merged		= adios_request_merged,
+		.requests_merged	= adios_merged_requests,
+		.bio_merge			= adios_bio_merge,
+		.insert_requests	= adios_insert_requests,
+		.prepare_request	= adios_prepare_request,
+		.dispatch_request	= adios_dispatch_request,
+		.completed_request	= adios_completed_request,
+		.finish_request		= adios_finish_request,
+		.has_work			= adios_has_work,
+		.init_hctx			= adios_init_hctx,
+		.init_sched			= adios_init_sched,
+		.exit_sched			= adios_exit_sched,
+	},
+	.elevator_attrs = adios_sched_attrs,
+	.elevator_name = "adios",
+	.elevator_owner = THIS_MODULE,
+};
+MODULE_ALIAS("mq-adios-iosched");
+
+#define ADIOS_PROGNAME "Adaptive Deadline I/O Scheduler"
+#define ADIOS_AUTHOR   "Masahito Suzuki"
+
+// Initialize the ADIOS scheduler module
+static int __init adios_init(void) {
+	printk(KERN_INFO "%s %s by %s\n",
+		ADIOS_PROGNAME, ADIOS_VERSION, ADIOS_AUTHOR);
+	return elv_register(&mq_adios);
+}
+
+// Exit the ADIOS scheduler module
+static void __exit adios_exit(void) {
+	elv_unregister(&mq_adios);
+}
+
+module_init(adios_init);
+module_exit(adios_exit);
+
+MODULE_AUTHOR(ADIOS_AUTHOR);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(ADIOS_PROGNAME);
\ No newline at end of file
diff --git a/block/elevator.c b/block/elevator.c
index 7c3ba80e5ff4..06e974eb6594 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -566,9 +566,17 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
 
 	if (q->nr_hw_queues != 1 &&
 	    !blk_mq_is_shared_tags(q->tag_set->flags))
+#if defined(CONFIG_CACHY)
+		return elevator_find_get("mq-deadline");
+#else
 		return NULL;
+#endif
 
+#if defined(CONFIG_CACHY) && defined(CONFIG_IOSCHED_BFQ)
+	return elevator_find_get("bfq");
+#else
 	return elevator_find_get("mq-deadline");
+#endif
 }
 
 /*
diff --git a/drivers/Makefile b/drivers/Makefile
index 45d1c3e630f7..4f5ab2429a7f 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -64,14 +64,8 @@ obj-y				+= char/
 # iommu/ comes before gpu as gpu are using iommu controllers
 obj-y				+= iommu/
 
-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y				+= gpu/
-
 obj-$(CONFIG_CONNECTOR)		+= connector/
 
-# i810fb depends on char/agp/
-obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
-
 obj-$(CONFIG_PARPORT)		+= parport/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
@@ -83,6 +77,13 @@ obj-y				+= macintosh/
 obj-y				+= scsi/
 obj-y				+= nvme/
 obj-$(CONFIG_ATA)		+= ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y				+= gpu/
+
+# i810fb depends on char/agp/
+obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
+
 obj-$(CONFIG_TARGET_CORE)	+= target/
 obj-$(CONFIG_MTD)		+= mtd/
 obj-$(CONFIG_SPI)		+= spi/
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 8d27c567be1c..479477438de0 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1618,7 +1618,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
 }
 #endif
 
-static void ahci_remap_check(struct pci_dev *pdev, int bar,
+static int ahci_remap_check(struct pci_dev *pdev, int bar,
 		struct ahci_host_priv *hpriv)
 {
 	int i;
@@ -1631,7 +1631,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
 	    pci_resource_len(pdev, bar) < SZ_512K ||
 	    bar != AHCI_PCI_BAR_STANDARD ||
 	    !(readl(hpriv->mmio + AHCI_VSCAP) & 1))
-		return;
+		return 0;
 
 	cap = readq(hpriv->mmio + AHCI_REMAP_CAP);
 	for (i = 0; i < AHCI_MAX_REMAP; i++) {
@@ -1646,18 +1646,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
 	}
 
 	if (!hpriv->remapped_nvme)
-		return;
-
-	dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n",
-		 hpriv->remapped_nvme);
-	dev_warn(&pdev->dev,
-		 "Switch your BIOS from RAID to AHCI mode to use them.\n");
+		return 0;
 
-	/*
-	 * Don't rely on the msi-x capability in the remap case,
-	 * share the legacy interrupt across ahci and remapped devices.
-	 */
-	hpriv->flags |= AHCI_HFLAG_NO_MSI;
+	/* Abort probe, allowing intel-nvme-remap to step in when available */
+	dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n");
+	return -ENODEV;
 }
 
 static int ahci_get_irq_vector(struct ata_host *host, int port)
@@ -1898,7 +1891,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		return -ENOMEM;
 
 	/* detect remapped nvme devices */
-	ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+	rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+	if (rc)
+		return rc;
 
 	sysfs_add_file_to_group(&pdev->dev.kobj,
 				&dev_attr_remapped_nvme.attr,
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 97c2d4f15d76..5a3af44d785a 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE
 	select ACPI_PROCESSOR if ACPI
 	select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO
 	select CPU_FREQ_GOV_PERFORMANCE
-	select CPU_FREQ_GOV_SCHEDUTIL if SMP
 	help
 	  This driver provides a P state for Intel core processors.
 	  The driver implements an internal governor and will become
@@ -39,7 +38,6 @@ config X86_AMD_PSTATE
 	depends on X86 && ACPI
 	select ACPI_PROCESSOR
 	select ACPI_CPPC_LIB if X86_64
-	select CPU_FREQ_GOV_SCHEDUTIL if SMP
 	help
 	  This driver adds a CPUFreq driver which utilizes a fine grain
 	  processor performance frequency control range instead of legacy
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index b8e2396a708a..d8e529cd454d 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -3819,6 +3819,8 @@ static int __init intel_pstate_setup(char *str)
 
 	if (!strcmp(str, "disable"))
 		no_load = 1;
+	else if (!strcmp(str, "enable"))
+		no_load = 0;
 	else if (!strcmp(str, "active"))
 		default_driver = &intel_pstate;
 	else if (!strcmp(str, "passive"))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4653a8d2823a..6590e83dfbf0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -160,6 +160,7 @@ struct amdgpu_watchdog_timer {
  */
 extern int amdgpu_modeset;
 extern unsigned int amdgpu_vram_limit;
+extern int amdgpu_ignore_min_pcap;
 extern int amdgpu_vis_vram_limit;
 extern int amdgpu_gart_size;
 extern int amdgpu_gtt_size;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
index 093141ad6ed0..e476e45b996a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
@@ -36,13 +36,6 @@
 #include "atombios_encoders.h"
 #include "bif/bif_4_1_d.h"
 
-static void amdgpu_atombios_lookup_i2c_gpio_quirks(struct amdgpu_device *adev,
-					  ATOM_GPIO_I2C_ASSIGMENT *gpio,
-					  u8 index)
-{
-
-}
-
 static struct amdgpu_i2c_bus_rec amdgpu_atombios_get_bus_rec_for_i2c_gpio(ATOM_GPIO_I2C_ASSIGMENT *gpio)
 {
 	struct amdgpu_i2c_bus_rec i2c;
@@ -108,9 +101,6 @@ struct amdgpu_i2c_bus_rec amdgpu_atombios_lookup_i2c_gpio(struct amdgpu_device *
 
 		gpio = &i2c_info->asGPIO_Info[0];
 		for (i = 0; i < num_indices; i++) {
-
-			amdgpu_atombios_lookup_i2c_gpio_quirks(adev, gpio, i);
-
 			if (gpio->sucI2cId.ucAccess == id) {
 				i2c = amdgpu_atombios_get_bus_rec_for_i2c_gpio(gpio);
 				break;
@@ -142,8 +132,6 @@ void amdgpu_atombios_i2c_init(struct amdgpu_device *adev)
 
 		gpio = &i2c_info->asGPIO_Info[0];
 		for (i = 0; i < num_indices; i++) {
-			amdgpu_atombios_lookup_i2c_gpio_quirks(adev, gpio, i);
-
 			i2c = amdgpu_atombios_get_bus_rec_for_i2c_gpio(gpio);
 
 			if (i2c.valid) {
@@ -156,6 +144,38 @@ void amdgpu_atombios_i2c_init(struct amdgpu_device *adev)
 	}
 }
 
+void amdgpu_atombios_oem_i2c_init(struct amdgpu_device *adev, u8 i2c_id)
+{
+	struct atom_context *ctx = adev->mode_info.atom_context;
+	ATOM_GPIO_I2C_ASSIGMENT *gpio;
+	struct amdgpu_i2c_bus_rec i2c;
+	int index = GetIndexIntoMasterTable(DATA, GPIO_I2C_Info);
+	struct _ATOM_GPIO_I2C_INFO *i2c_info;
+	uint16_t data_offset, size;
+	int i, num_indices;
+	char stmp[32];
+
+	if (amdgpu_atom_parse_data_header(ctx, index, &size, NULL, NULL, &data_offset)) {
+		i2c_info = (struct _ATOM_GPIO_I2C_INFO *)(ctx->bios + data_offset);
+
+		num_indices = (size - sizeof(ATOM_COMMON_TABLE_HEADER)) /
+			sizeof(ATOM_GPIO_I2C_ASSIGMENT);
+
+		gpio = &i2c_info->asGPIO_Info[0];
+		for (i = 0; i < num_indices; i++) {
+			i2c = amdgpu_atombios_get_bus_rec_for_i2c_gpio(gpio);
+
+			if (i2c.valid && i2c.i2c_id == i2c_id) {
+				sprintf(stmp, "OEM 0x%x", i2c.i2c_id);
+				adev->i2c_bus[i] = amdgpu_i2c_create(adev_to_drm(adev), &i2c, stmp);
+				break;
+			}
+			gpio = (ATOM_GPIO_I2C_ASSIGMENT *)
+				((u8 *)gpio + sizeof(ATOM_GPIO_I2C_ASSIGMENT));
+		}
+	}
+}
+
 struct amdgpu_gpio_rec
 amdgpu_atombios_lookup_gpio(struct amdgpu_device *adev,
 			    u8 id)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h
index 0e16432d9a72..867bc5c5ce67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h
@@ -136,6 +136,7 @@ amdgpu_atombios_lookup_gpio(struct amdgpu_device *adev,
 struct amdgpu_i2c_bus_rec amdgpu_atombios_lookup_i2c_gpio(struct amdgpu_device *adev,
 							  uint8_t id);
 void amdgpu_atombios_i2c_init(struct amdgpu_device *adev);
+void amdgpu_atombios_oem_i2c_init(struct amdgpu_device *adev, u8 i2c_id);
 
 bool amdgpu_atombios_has_dce_engine_info(struct amdgpu_device *adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3780d50fd3ae..6f622c9944e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4468,8 +4468,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 				goto failed;
 			}
 			/* init i2c buses */
-			if (!amdgpu_device_has_dc_support(adev))
-				amdgpu_atombios_i2c_init(adev);
+			amdgpu_i2c_init(adev);
 		}
 	}
 
@@ -4731,8 +4730,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
 	amdgpu_reset_fini(adev);
 
 	/* free i2c buses */
-	if (!amdgpu_device_has_dc_support(adev))
-		amdgpu_i2c_fini(adev);
+	amdgpu_i2c_fini(adev);
 
 	if (amdgpu_emu_mode != 1)
 		amdgpu_atombios_fini(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 91a874bb0e24..32298a4a7fa7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -139,6 +139,7 @@ enum AMDGPU_DEBUG_MASK {
 };
 
 unsigned int amdgpu_vram_limit = UINT_MAX;
+int amdgpu_ignore_min_pcap = 0; /* do not ignore by default */
 int amdgpu_vis_vram_limit;
 int amdgpu_gart_size = -1; /* auto */
 int amdgpu_gtt_size = -1; /* auto */
@@ -257,6 +258,15 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
 	.period = 0x0, /* default to 0x0 (timeout disable) */
 };
 
+/**
+ * DOC: ignore_min_pcap (int)
+ * Ignore the minimum power cap.
+ * Useful on graphics cards where the minimum power cap is very high.
+ * The default is 0 (Do not ignore).
+ */
+MODULE_PARM_DESC(ignore_min_pcap, "Ignore the minimum power cap");
+module_param_named(ignore_min_pcap, amdgpu_ignore_min_pcap, int, 0600);
+
 /**
  * DOC: vramlimit (int)
  * Restrict the total amount of VRAM in MiB for testing.  The default is 0 (Use full VRAM).
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c
index f0765ccde668..8179d0814db9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c
@@ -225,6 +225,25 @@ void amdgpu_i2c_destroy(struct amdgpu_i2c_chan *i2c)
 	kfree(i2c);
 }
 
+void amdgpu_i2c_init(struct amdgpu_device *adev)
+{
+	if (!adev->is_atom_fw) {
+		if (!amdgpu_device_has_dc_support(adev)) {
+			amdgpu_atombios_i2c_init(adev);
+		} else {
+			switch (adev->asic_type) {
+			case CHIP_POLARIS10:
+			case CHIP_POLARIS11:
+			case CHIP_POLARIS12:
+				amdgpu_atombios_oem_i2c_init(adev, 0x97);
+				break;
+			default:
+				break;
+			}
+		}
+	}
+}
+
 /* remove all the buses */
 void amdgpu_i2c_fini(struct amdgpu_device *adev)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.h
index 21e3d1dad0a1..1d3d3806e0dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.h
@@ -28,6 +28,7 @@ struct amdgpu_i2c_chan *amdgpu_i2c_create(struct drm_device *dev,
 					  const struct amdgpu_i2c_bus_rec *rec,
 					  const char *name);
 void amdgpu_i2c_destroy(struct amdgpu_i2c_chan *i2c);
+void amdgpu_i2c_init(struct amdgpu_device *adev);
 void amdgpu_i2c_fini(struct amdgpu_device *adev);
 struct amdgpu_i2c_chan *
 amdgpu_i2c_lookup(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
index 5e3faefc5510..6da4f946cac0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
@@ -609,6 +609,7 @@ struct amdgpu_i2c_adapter {
 	struct i2c_adapter base;
 
 	struct ddc_service *ddc_service;
+	bool oem;
 };
 
 #define TO_DM_AUX(x) container_of((x), struct amdgpu_dm_dp_aux, aux)
diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig
index 11e3f2f3b174..7b1bd69dc29e 100644
--- a/drivers/gpu/drm/amd/display/Kconfig
+++ b/drivers/gpu/drm/amd/display/Kconfig
@@ -54,4 +54,10 @@ config DRM_AMD_SECURE_DISPLAY
 	  This option enables the calculation of crc of specific region via
 	  debugfs. Cooperate with specific DMCU FW.
 
+config AMD_PRIVATE_COLOR
+	bool "Enable KMS color management by AMD for AMD"
+	default n
+	help
+	  This option extends the KMS color management API with AMD
+	  driver-specific properties to enhance the color management
+	  support on AMD Steam Deck.
+
 endmenu
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index ca6b9a585aba..ddcb2c66c916 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -178,6 +178,8 @@ static int amdgpu_dm_init(struct amdgpu_device *adev);
 static void amdgpu_dm_fini(struct amdgpu_device *adev);
 static bool is_freesync_video_mode(const struct drm_display_mode *mode, struct amdgpu_dm_connector *aconnector);
 static void reset_freesync_config_for_crtc(struct dm_crtc_state *new_crtc_state);
+static struct amdgpu_i2c_adapter *
+create_i2c(struct ddc_service *ddc_service, bool oem);
 
 static enum drm_mode_subconnector get_subconnector_type(struct dc_link *link)
 {
@@ -2897,6 +2899,33 @@ static int amdgpu_dm_smu_write_watermarks_table(struct amdgpu_device *adev)
 	return 0;
 }
 
+static int dm_oem_i2c_hw_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_display_manager *dm = &adev->dm;
+	struct amdgpu_i2c_adapter *oem_i2c;
+	struct ddc_service *oem_ddc_service;
+	int r;
+
+	oem_ddc_service = dc_get_oem_i2c_device(adev->dm.dc);
+	if (oem_ddc_service) {
+		oem_i2c = create_i2c(oem_ddc_service, true);
+		if (!oem_i2c) {
+			dev_info(adev->dev, "Failed to create oem i2c adapter data\n");
+			return -ENOMEM;
+		}
+
+		r = i2c_add_adapter(&oem_i2c->base);
+		if (r) {
+			dev_info(adev->dev, "Failed to register oem i2c\n");
+			kfree(oem_i2c);
+			return r;
+		}
+		dm->oem_i2c = oem_i2c;
+	}
+
+	return 0;
+}
+
 /**
  * dm_hw_init() - Initialize DC device
  * @ip_block: Pointer to the amdgpu_ip_block for this hw instance.
@@ -2928,6 +2957,10 @@ static int dm_hw_init(struct amdgpu_ip_block *ip_block)
 		return r;
 	amdgpu_dm_hpd_init(adev);
 
+	r = dm_oem_i2c_hw_init(adev);
+	if (r)
+		dev_info(adev->dev, "Failed to add OEM i2c bus\n");
+
 	return 0;
 }
 
@@ -2943,6 +2976,8 @@ static int dm_hw_fini(struct amdgpu_ip_block *ip_block)
 {
 	struct amdgpu_device *adev = ip_block->adev;
 
+	kfree(adev->dm.oem_i2c);
+
 	amdgpu_dm_hpd_fini(adev);
 
 	amdgpu_dm_irq_fini(adev);
@@ -4579,7 +4614,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev)
 		return r;
 	}
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	if (amdgpu_dm_create_color_properties(adev)) {
 		dc_state_release(state->context);
 		kfree(state);
@@ -8290,7 +8325,7 @@ static int amdgpu_dm_i2c_xfer(struct i2c_adapter *i2c_adap,
 	int i;
 	int result = -EIO;
 
-	if (!ddc_service->ddc_pin || !ddc_service->ddc_pin->hw_info.hw_supported)
+	if (!ddc_service->ddc_pin)
 		return result;
 
 	cmd.payloads = kcalloc(num, sizeof(struct i2c_payload), GFP_KERNEL);
@@ -8309,11 +8344,18 @@ static int amdgpu_dm_i2c_xfer(struct i2c_adapter *i2c_adap,
 		cmd.payloads[i].data = msgs[i].buf;
 	}
 
-	if (dc_submit_i2c(
-			ddc_service->ctx->dc,
-			ddc_service->link->link_index,
-			&cmd))
-		result = num;
+	if (i2c->oem) {
+		if (dc_submit_i2c_oem(
+			    ddc_service->ctx->dc,
+			    &cmd))
+			result = num;
+	} else {
+		if (dc_submit_i2c(
+			    ddc_service->ctx->dc,
+			    ddc_service->link->link_index,
+			    &cmd))
+			result = num;
+	}
 
 	kfree(cmd.payloads);
 	return result;
@@ -8330,9 +8372,7 @@ static const struct i2c_algorithm amdgpu_dm_i2c_algo = {
 };
 
 static struct amdgpu_i2c_adapter *
-create_i2c(struct ddc_service *ddc_service,
-	   int link_index,
-	   int *res)
+create_i2c(struct ddc_service *ddc_service, bool oem)
 {
 	struct amdgpu_device *adev = ddc_service->ctx->driver_context;
 	struct amdgpu_i2c_adapter *i2c;
@@ -8343,9 +8383,14 @@ create_i2c(struct ddc_service *ddc_service,
 	i2c->base.owner = THIS_MODULE;
 	i2c->base.dev.parent = &adev->pdev->dev;
 	i2c->base.algo = &amdgpu_dm_i2c_algo;
-	snprintf(i2c->base.name, sizeof(i2c->base.name), "AMDGPU DM i2c hw bus %d", link_index);
+	if (oem)
+		snprintf(i2c->base.name, sizeof(i2c->base.name), "AMDGPU DM i2c OEM bus");
+	else
+		snprintf(i2c->base.name, sizeof(i2c->base.name), "AMDGPU DM i2c hw bus %d",
+			 ddc_service->link->link_index);
 	i2c_set_adapdata(&i2c->base, i2c);
 	i2c->ddc_service = ddc_service;
+	i2c->oem = oem;
 
 	return i2c;
 }
@@ -8370,7 +8415,7 @@ static int amdgpu_dm_connector_init(struct amdgpu_display_manager *dm,
 	link->priv = aconnector;
 
 
-	i2c = create_i2c(link->ddc, link->link_index, &res);
+	i2c = create_i2c(link->ddc, false);
 	if (!i2c) {
 		DRM_ERROR("Failed to create i2c adapter data\n");
 		return -ENOMEM;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 2227cd8e4a89..5710776bb0e2 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -606,6 +606,13 @@ struct amdgpu_display_manager {
 	 * Bounding box data read from dmub during early initialization for DCN4+
 	 */
 	struct dml2_soc_bb *bb_from_dmub;
+
+	/**
+	 * @oem_i2c:
+	 *
+	 * OEM i2c bus
+	 */
+	struct amdgpu_i2c_adapter *oem_i2c;
 };
 
 enum dsc_clock_force_state {
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
index ebabfe3a512f..4d3ebcaacca1 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
@@ -97,7 +97,7 @@ static inline struct fixed31_32 amdgpu_dm_fixpt_from_s3132(__u64 x)
 	return val;
 }
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 /* Pre-defined Transfer Functions (TF)
  *
  * AMD driver supports pre-defined mathematical functions for transferring
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
index 36a830a7440f..a8fc8bd52d51 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
@@ -470,7 +470,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc)
 }
 #endif
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 /**
  * dm_crtc_additional_color_mgmt - enable additional color properties
  * @crtc: DRM CRTC
@@ -552,7 +552,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = {
 #if defined(CONFIG_DEBUG_FS)
 	.late_register = amdgpu_dm_crtc_late_register,
 #endif
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	.atomic_set_property = amdgpu_dm_atomic_crtc_set_property,
 	.atomic_get_property = amdgpu_dm_atomic_crtc_get_property,
 #endif
@@ -731,7 +731,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm,
 
 	drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES);
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	dm_crtc_additional_color_mgmt(&acrtc->base);
 #endif
 	return 0;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
index 83c7c8853ede..059d0ca29518 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
@@ -1567,7 +1567,7 @@ static void amdgpu_dm_plane_drm_plane_destroy_state(struct drm_plane *plane,
 	drm_atomic_helper_plane_destroy_state(plane, state);
 }
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 static void
 dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm,
 					     struct drm_plane *plane)
@@ -1758,7 +1758,7 @@ static const struct drm_plane_funcs dm_plane_funcs = {
 	.atomic_duplicate_state = amdgpu_dm_plane_drm_plane_duplicate_state,
 	.atomic_destroy_state = amdgpu_dm_plane_drm_plane_destroy_state,
 	.format_mod_supported = amdgpu_dm_plane_format_mod_supported,
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	.atomic_set_property = dm_atomic_plane_set_property,
 	.atomic_get_property = dm_atomic_plane_get_property,
 #endif
@@ -1851,7 +1851,7 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm,
 
 	drm_plane_helper_add(plane, &dm_plane_helper_funcs);
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	dm_atomic_plane_attach_color_mgmt_properties(dm, plane);
 #endif
 	/* Create (reset) the plane state */
diff --git a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c
index c9a6de110b74..470ec970217b 100644
--- a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c
+++ b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c
@@ -1778,6 +1778,7 @@ static enum bp_result get_firmware_info_v3_1(
 	struct dc_firmware_info *info)
 {
 	struct atom_firmware_info_v3_1 *firmware_info;
+	struct atom_firmware_info_v3_2 *firmware_info32;
 	struct atom_display_controller_info_v4_1 *dce_info = NULL;
 
 	if (!info)
@@ -1785,6 +1786,8 @@ static enum bp_result get_firmware_info_v3_1(
 
 	firmware_info = GET_IMAGE(struct atom_firmware_info_v3_1,
 			DATA_TABLES(firmwareinfo));
+	firmware_info32 = GET_IMAGE(struct atom_firmware_info_v3_2,
+			DATA_TABLES(firmwareinfo));
 
 	dce_info = GET_IMAGE(struct atom_display_controller_info_v4_1,
 			DATA_TABLES(dce_info));
@@ -1817,7 +1820,15 @@ static enum bp_result get_firmware_info_v3_1(
 				bp->cmd_tbl.get_smu_clock_info(bp, SMU9_SYSPLL0_ID) * 10;
 	}
 
-	info->oem_i2c_present = false;
+	/* These fields are marked as reserved in v3_1, but they appear to be populated
+	 * properly.
+	 */
+	if (firmware_info32->board_i2c_feature_id == 0x2) {
+		info->oem_i2c_present = true;
+		info->oem_i2c_obj_id = firmware_info32->board_i2c_feature_gpio_id;
+	} else {
+		info->oem_i2c_present = false;
+	}
 
 	return BP_RESULT_OK;
 }
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_exports.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_exports.c
index 457d60eeb486..13636eb4ec3f 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link_exports.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_exports.c
@@ -142,6 +142,12 @@ bool dc_link_update_dsc_config(struct pipe_ctx *pipe_ctx)
 	return link->dc->link_srv->update_dsc_config(pipe_ctx);
 }
 
+struct ddc_service *
+dc_get_oem_i2c_device(struct dc *dc)
+{
+	return dc->res_pool->oem_device;
+}
+
 bool dc_is_oem_i2c_device_present(
 	struct dc *dc,
 	size_t slave_address)
diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h
index 08c5a315b3a6..70d6005ecd64 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -1939,6 +1939,9 @@ int dc_link_aux_transfer_raw(struct ddc_service *ddc,
 		struct aux_payload *payload,
 		enum aux_return_code_type *operation_result);
 
+struct ddc_service *
+dc_get_oem_i2c_device(struct dc *dc);
+
 bool dc_is_oem_i2c_device_present(
 	struct dc *dc,
 	size_t slave_address
diff --git a/drivers/gpu/drm/amd/display/dc/resource/dce120/dce120_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dce120/dce120_resource.c
index c63c59623433..eb1e158d3436 100644
--- a/drivers/gpu/drm/amd/display/dc/resource/dce120/dce120_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/resource/dce120/dce120_resource.c
@@ -67,6 +67,7 @@
 #include "reg_helper.h"
 
 #include "dce100/dce100_resource.h"
+#include "link.h"
 
 #ifndef mmDP0_DP_DPHY_INTERNAL_CTRL
 	#define mmDP0_DP_DPHY_INTERNAL_CTRL		0x210f
@@ -659,6 +660,12 @@ static void dce120_resource_destruct(struct dce110_resource_pool *pool)
 
 	if (pool->base.dmcu != NULL)
 		dce_dmcu_destroy(&pool->base.dmcu);
+
+	if (pool->base.oem_device != NULL) {
+		struct dc *dc = pool->base.oem_device->ctx->dc;
+
+		dc->link_srv->destroy_ddc_service(&pool->base.oem_device);
+	}
 }
 
 static void read_dce_straps(
@@ -1054,6 +1061,7 @@ static bool dce120_resource_construct(
 	struct dc *dc,
 	struct dce110_resource_pool *pool)
 {
+	struct ddc_service_init_data ddc_init_data = {0};
 	unsigned int i;
 	int j;
 	struct dc_context *ctx = dc->ctx;
@@ -1257,6 +1265,15 @@ static bool dce120_resource_construct(
 
 	bw_calcs_data_update_from_pplib(dc);
 
+	if (dc->ctx->dc_bios->fw_info.oem_i2c_present) {
+		ddc_init_data.ctx = dc->ctx;
+		ddc_init_data.link = NULL;
+		ddc_init_data.id.id = dc->ctx->dc_bios->fw_info.oem_i2c_obj_id;
+		ddc_init_data.id.enum_id = 0;
+		ddc_init_data.id.type = OBJECT_TYPE_GENERIC;
+		pool->base.oem_device = dc->link_srv->create_ddc_service(&ddc_init_data);
+	}
+
 	return true;
 
 irqs_create_fail:
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index e8ae7681bf0a..8a0d873983f3 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -3178,6 +3178,9 @@ static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
 {
+	if (amdgpu_ignore_min_pcap)
+		return sysfs_emit(buf, "%i\n", 0);
+
 	return amdgpu_hwmon_show_power_cap_generic(dev, attr, buf, PP_PWR_LIMIT_MIN);
 }
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index c0b98749dde7..3aff1eea82ff 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2810,7 +2810,10 @@ int smu_get_power_limit(void *handle,
 			*limit = smu->max_power_limit;
 			break;
 		case SMU_PPT_LIMIT_MIN:
-			*limit = smu->min_power_limit;
+			if (amdgpu_ignore_min_pcap)
+				*limit = 0;
+			else
+				*limit = smu->min_power_limit;
 			break;
 		default:
 			return -EINVAL;
@@ -2834,7 +2837,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit)
 		if (smu->ppt_funcs->set_power_limit)
 			return smu->ppt_funcs->set_power_limit(smu, limit_type, limit);
 
-	if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) {
+	if (amdgpu_ignore_min_pcap) {
+		if ((limit > smu->max_power_limit)) {
+			dev_err(smu->adev->dev,
+				"New power limit (%d) is over the max allowed %d\n",
+				limit, smu->max_power_limit);
+			return -EINVAL;
+		}
+	} else if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) {
 		dev_err(smu->adev->dev,
 			"New power limit (%d) is out of range [%d,%d]\n",
 			limit, smu->min_power_limit, smu->max_power_limit);
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index b5cbb57ee5f6..a0f7fa1518c6 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -46,6 +46,7 @@ struct evdev_client {
 	struct fasync_struct *fasync;
 	struct evdev *evdev;
 	struct list_head node;
+	struct rcu_head rcu;
 	enum input_clock_type clk_type;
 	bool revoked;
 	unsigned long *evmasks[EV_CNT];
@@ -368,13 +369,22 @@ static void evdev_attach_client(struct evdev *evdev,
 	spin_unlock(&evdev->client_lock);
 }
 
+static void evdev_reclaim_client(struct rcu_head *rp)
+{
+	struct evdev_client *client = container_of(rp, struct evdev_client, rcu);
+	unsigned int i;
+	for (i = 0; i < EV_CNT; ++i)
+		bitmap_free(client->evmasks[i]);
+	kvfree(client);
+}
+
 static void evdev_detach_client(struct evdev *evdev,
 				struct evdev_client *client)
 {
 	spin_lock(&evdev->client_lock);
 	list_del_rcu(&client->node);
 	spin_unlock(&evdev->client_lock);
-	synchronize_rcu();
+	call_rcu(&client->rcu, evdev_reclaim_client);
 }
 
 static int evdev_open_device(struct evdev *evdev)
@@ -427,7 +437,6 @@ static int evdev_release(struct inode *inode, struct file *file)
 {
 	struct evdev_client *client = file->private_data;
 	struct evdev *evdev = client->evdev;
-	unsigned int i;
 
 	mutex_lock(&evdev->mutex);
 
@@ -439,11 +448,6 @@ static int evdev_release(struct inode *inode, struct file *file)
 
 	evdev_detach_client(evdev, client);
 
-	for (i = 0; i < EV_CNT; ++i)
-		bitmap_free(client->evmasks[i]);
-
-	kvfree(client);
-
 	evdev_close_device(evdev);
 
 	return 0;
@@ -486,7 +490,6 @@ static int evdev_open(struct inode *inode, struct file *file)
 
  err_free_client:
 	evdev_detach_client(evdev, client);
-	kvfree(client);
 	return error;
 }
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 78c975d7cd5f..965043bd2093 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3308,6 +3308,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			goto bad;
 	}
 
+#ifdef CONFIG_CACHY
+	set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+	set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+#endif
+
 	ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
 	if (ret < 0)
 		goto bad;
diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig
index 331b8e535e5b..80dabeebf580 100644
--- a/drivers/media/v4l2-core/Kconfig
+++ b/drivers/media/v4l2-core/Kconfig
@@ -40,6 +40,11 @@ config VIDEO_TUNER
 config V4L2_JPEG_HELPER
 	tristate
 
+config V4L2_LOOPBACK
+	tristate "V4L2 loopback device"
+	help
+	  V4L2 loopback device
+
 # Used by drivers that need v4l2-h264.ko
 config V4L2_H264
 	tristate
diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile
index 2177b9d63a8f..c179507cedc4 100644
--- a/drivers/media/v4l2-core/Makefile
+++ b/drivers/media/v4l2-core/Makefile
@@ -33,5 +33,7 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o
 obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o
 obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o
 
+obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o
+
 obj-$(CONFIG_VIDEO_TUNER) += tuner.o
 obj-$(CONFIG_VIDEO_DEV) += v4l2-dv-timings.o videodev.o
diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c
new file mode 100644
index 000000000000..25cb1beb26e5
--- /dev/null
+++ b/drivers/media/v4l2-core/v4l2loopback.c
@@ -0,0 +1,3184 @@
+/* -*- c-file-style: "linux" -*- */
+/*
+ * v4l2loopback.c  --  video4linux2 loopback driver
+ *
+ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com)
+ * Copyright (C) 2010-2023 IOhannes m zmoelnig (zmoelnig@iem.at)
+ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de)
+ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/module.h>
+#include <linux/videodev2.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/capability.h>
+#include <linux/eventpoll.h>
+#include <media/v4l2-ioctl.h>
+#include <media/v4l2-common.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-ctrls.h>
+#include <media/v4l2-event.h>
+
+#include <linux/miscdevice.h>
+#include "v4l2loopback.h"
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
+#error This module is not supported on kernels before 4.0.0.
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)
+#define strscpy strlcpy
+#endif
+
+#if defined(timer_setup) && defined(from_timer)
+#define HAVE_TIMER_SETUP
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0)
+#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER
+#endif
+
+#define V4L2LOOPBACK_VERSION_CODE                                              \
+	KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \
+		       V4L2LOOPBACK_VERSION_BUGFIX)
+
+MODULE_DESCRIPTION("V4L2 loopback video device");
+MODULE_AUTHOR("Vasily Levin, "
+	      "IOhannes m zmoelnig <zmoelnig@iem.at>,"
+	      "Stefan Diewald,"
+	      "Anton Novikov"
+	      "et al.");
+#ifdef SNAPSHOT_VERSION
+MODULE_VERSION(__stringify(SNAPSHOT_VERSION));
+#else
+MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify(
+	V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX));
+#endif
+MODULE_LICENSE("GPL");
+
+/*
+ * helpers
+ */
+#define dprintk(fmt, args...)                                          \
+	do {                                                           \
+		if (debug > 0) {                                       \
+			printk(KERN_INFO "v4l2-loopback[" __stringify( \
+				       __LINE__) "], pid(%d):  " fmt,  \
+			       task_pid_nr(current), ##args);          \
+		}                                                      \
+	} while (0)
+
+#define MARK()                                                             \
+	do {                                                               \
+		if (debug > 1) {                                           \
+			printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \
+			       __LINE__, __func__, task_pid_nr(current));  \
+		}                                                          \
+	} while (0)
+
+#define dprintkrw(fmt, args...)                                        \
+	do {                                                           \
+		if (debug > 2) {                                       \
+			printk(KERN_INFO "v4l2-loopback[" __stringify( \
+				       __LINE__) "], pid(%d): " fmt,   \
+			       task_pid_nr(current), ##args);          \
+		}                                                      \
+	} while (0)
+
+static inline void v4l2l_get_timestamp(struct v4l2_buffer *b)
+{
+	struct timespec64 ts;
+	ktime_get_ts64(&ts);
+
+	b->timestamp.tv_sec = ts.tv_sec;
+	b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC);
+	b->flags |= V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
+}
+
+#if BITS_PER_LONG == 32
+#include <asm/div64.h> /* do_div() for 64bit division */
+static inline int v4l2l_mod64(const s64 A, const u32 B)
+{
+	u64 a = (u64)A;
+	u32 b = B;
+
+	if (A > 0)
+		return do_div(a, b);
+	a = -A;
+	return -do_div(a, b);
+}
+#else
+static inline int v4l2l_mod64(const s64 A, const u32 B)
+{
+	return A % B;
+}
+#endif
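+
+/*
+ * Worked example for the 32-bit branch above (purely illustrative):
+ * v4l2l_mod64(-7, 3) negates A to 7, do_div(7, 3) yields a remainder of 1,
+ * and -1 is returned, matching the sign behaviour of the native "A % B"
+ * used on 64-bit builds.
+ */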
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0)
+typedef unsigned __poll_t;
+#endif
+
+/* module constants
+ *  can be overridden during the build process using something like
+ *	make KCPPFLAGS="-DMAX_DEVICES=100"
+ */
+
+/* maximum number of v4l2loopback devices that can be created */
+#ifndef MAX_DEVICES
+#define MAX_DEVICES 8
+#endif
+
+/* whether the default is to announce capabilities exclusively or not */
+#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS
+#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0
+#endif
+
+/* when a producer is considered to have gone stale */
+#ifndef MAX_TIMEOUT
+#define MAX_TIMEOUT (100 * 1000) /* in msecs */
+#endif
+
+/* maximum number of buffers that can be mapped; they all map onto the
+ * max_buffers actually allocated buffers */
+#ifndef MAX_BUFFERS
+#define MAX_BUFFERS 32
+#endif
+
+/* module parameters */
+static int debug = 0;
+module_param(debug, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)");
+
+#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2
+static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS;
+module_param(max_buffers, int, S_IRUGO);
+MODULE_PARM_DESC(max_buffers,
+		 "how many buffers should be allocated [DEFAULT: " __stringify(
+			 V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]");
+
+/* how many times a device can be opened
+ * the per-module default value can be overridden on a per-device basis using
+ * the /sys/devices interface
+ *
+ * note that max_openers should be at least 2 in order to get a working system:
+ *   one opener for the producer and one opener for the consumer
+ *   however, we leave that to the user
+ */
+#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10
+static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS;
+module_param(max_openers, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(
+	max_openers,
+	"how many users can open the loopback device [DEFAULT: " __stringify(
+		V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]");
+
+static int devices = -1;
+module_param(devices, int, 0);
+MODULE_PARM_DESC(devices, "how many devices should be created");
+
+static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 };
+module_param_array(video_nr, int, NULL, 0444);
+MODULE_PARM_DESC(video_nr,
+		 "video device numbers (-1=auto, 0=/dev/video0, etc.)");
+
+static char *card_label[MAX_DEVICES];
+module_param_array(card_label, charp, NULL, 0000);
+MODULE_PARM_DESC(card_label, "card labels for each device");
+
+static bool exclusive_caps[MAX_DEVICES] = {
+	[0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS
+};
+module_param_array(exclusive_caps, bool, NULL, 0444);
+/* FIXXME: wording */
+MODULE_PARM_DESC(
+	exclusive_caps,
+	"whether to announce OUTPUT/CAPTURE capabilities exclusively or not [DEFAULT: " __stringify(
+		V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]");
+
+/* format specifications */
+#define V4L2LOOPBACK_SIZE_MIN_WIDTH 2
+#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 1
+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192
+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192
+
+#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640
+#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480
+
+static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH;
+module_param(max_width, int, S_IRUGO);
+MODULE_PARM_DESC(max_width,
+		 "maximum allowed frame width [DEFAULT: " __stringify(
+			 V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]");
+static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT;
+module_param(max_height, int, S_IRUGO);
+MODULE_PARM_DESC(max_height,
+		 "maximum allowed frame height [DEFAULT: " __stringify(
+			 V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]");
+
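+/* registry of all loopback devices, plus a mutex serializing control
+ * operations on it */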
+static DEFINE_IDR(v4l2loopback_index_idr);
+static DEFINE_MUTEX(v4l2loopback_ctl_mutex);
+
+/* frame intervals */
+#define V4L2LOOPBACK_FPS_MIN 0
+#define V4L2LOOPBACK_FPS_MAX 1000
+
+/* control IDs */
+#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000)
+#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0)
+#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1)
+#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2)
+#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3)
+
+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl);
+static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = {
+	.s_ctrl = v4l2loopback_s_ctrl,
+};
+static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = {
+	// clang-format off
+	.ops	= &v4l2loopback_ctrl_ops,
+	.id	= CID_KEEP_FORMAT,
+	.name	= "keep_format",
+	.type	= V4L2_CTRL_TYPE_BOOLEAN,
+	.min	= 0,
+	.max	= 1,
+	.step	= 1,
+	.def	= 0,
+	// clang-format on
+};
+static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = {
+	// clang-format off
+	.ops	= &v4l2loopback_ctrl_ops,
+	.id	= CID_SUSTAIN_FRAMERATE,
+	.name	= "sustain_framerate",
+	.type	= V4L2_CTRL_TYPE_BOOLEAN,
+	.min	= 0,
+	.max	= 1,
+	.step	= 1,
+	.def	= 0,
+	// clang-format on
+};
+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = {
+	// clang-format off
+	.ops	= &v4l2loopback_ctrl_ops,
+	.id	= CID_TIMEOUT,
+	.name	= "timeout",
+	.type	= V4L2_CTRL_TYPE_INTEGER,
+	.min	= 0,
+	.max	= MAX_TIMEOUT,
+	.step	= 1,
+	.def	= 0,
+	// clang-format on
+};
+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = {
+	// clang-format off
+	.ops	= &v4l2loopback_ctrl_ops,
+	.id	= CID_TIMEOUT_IMAGE_IO,
+	.name	= "timeout_image_io",
+	.type	= V4L2_CTRL_TYPE_BUTTON,
+	.min	= 0,
+	.max	= 1,
+	.step	= 1,
+	.def	= 0,
+	// clang-format on
+};
+
+/* module structures */
+struct v4l2loopback_private {
+	int device_nr;
+};
+
+/* TODO(vasaka) use typenames which are common to kernel, but first find out if
+ * it is needed */
+/* struct keeping state and settings of loopback device */
+
+struct v4l2l_buffer {
+	struct v4l2_buffer buffer;
+	struct list_head list_head;
+	int use_count;
+};
+
+struct v4l2_loopback_device {
+	struct v4l2_device v4l2_dev;
+	struct v4l2_ctrl_handler ctrl_handler;
+	struct video_device *vdev;
+	/* pixel and stream format */
+	struct v4l2_pix_format pix_format;
+	bool pix_format_has_valid_sizeimage;
+	struct v4l2_captureparm capture_param;
+	unsigned long frame_jiffies;
+
+	/* ctrls */
+	int keep_format; /* CID_KEEP_FORMAT; stay ready_for_capture even when all
+			    openers close() the device */
+	int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain
+				  (close to) nominal framerate */
+
+	/* buffers stuff */
+	u8 *image; /* pointer to actual buffers data */
+	unsigned long int imagesize; /* size of buffers data */
+	int buffers_number; /* should not be big, 4 is a good choice */
+	struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */
+	int used_buffers; /* number of the actually used buffers */
+	int max_openers; /* how many times can this device be opened */
+
+	s64 write_position; /* number of last written frame + 1 */
+	struct list_head outbufs_list; /* buffers in output DQBUF order */
+	int bufpos2index
+		[MAX_BUFFERS]; /* mapping of (read/write_position % used_buffers)
+                        * to inner buffer index */
+	long buffer_size;
+
+	/* sustain_framerate stuff */
+	struct timer_list sustain_timer;
+	unsigned int reread_count;
+
+	/* timeout stuff */
+	unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */
+	int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will
+			       * read/write to timeout_image */
+	u8 *timeout_image; /* copy of it will be captured when timeout passes */
+	struct v4l2l_buffer timeout_image_buffer;
+	struct timer_list timeout_timer;
+	int timeout_happened;
+
+	/* sync stuff */
+	atomic_t open_count;
+
+	int ready_for_capture; /* set to the number of writers that opened the
+                                * device and negotiated format. */
+	int ready_for_output; /* set to true when no writer is currently attached
+			       * this differs slightly from !ready_for_capture,
+			       * e.g. when using fallback images */
+	int active_readers; /* increase if any reader starts streaming */
+	int announce_all_caps; /* set to false, if device caps (OUTPUT/CAPTURE)
+                                * should only be announced if the resp. "ready"
+                                * flag is set; default=TRUE */
+
+	int min_width, max_width;
+	int min_height, max_height;
+
+	char card_label[32];
+
+	wait_queue_head_t read_event;
+	spinlock_t lock, list_lock;
+};
+
+/* the type of an opener shows what the opener wants to do with the loopback device */
+enum opener_type {
+	// clang-format off
+	UNNEGOTIATED	= 0,
+	READER		= 1,
+	WRITER		= 2,
+	// clang-format on
+};
+
+/* struct keeping state and type of opener */
+struct v4l2_loopback_opener {
+	enum opener_type type;
+	s64 read_position; /* number of last processed frame + 1 or
+			    * write_position - 1 if reader went out of sync */
+	unsigned int reread_count;
+	struct v4l2_buffer *buffers;
+	int buffers_number; /* should not be big, 4 is a good choice */
+	int timeout_image_io;
+
+	struct v4l2_fh fh;
+};
+
+#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh)
+
+/* this is heavily inspired by the bttv driver found in the linux kernel */
+struct v4l2l_format {
+	char *name;
+	int fourcc; /* video4linux 2 */
+	int depth; /* bit/pixel */
+	int flags;
+};
+/* set the v4l2l_format.flags to PLANAR for non-packed formats */
+#define FORMAT_FLAGS_PLANAR 0x01
+#define FORMAT_FLAGS_COMPRESSED 0x02
+
+#include "v4l2loopback_formats.h"
+
+#ifndef V4L2_TYPE_IS_CAPTURE
+#define V4L2_TYPE_IS_CAPTURE(type)                \
+	((type) == V4L2_BUF_TYPE_VIDEO_CAPTURE || \
+	 (type) == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE)
+#endif /* V4L2_TYPE_IS_CAPTURE */
+#ifndef V4L2_TYPE_IS_OUTPUT
+#define V4L2_TYPE_IS_OUTPUT(type)                \
+	((type) == V4L2_BUF_TYPE_VIDEO_OUTPUT || \
+	 (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE)
+#endif /* V4L2_TYPE_IS_OUTPUT */
+
+/* whether the format can still be changed */
+/* the format is fixed if we
+   - have writers (ready_for_capture>0)
+   - and/or have readers (active_readers>0)
+   - and/or keep_format is set
+*/
+#define V4L2LOOPBACK_IS_FIXED_FMT(device)                               \
+	(device->ready_for_capture > 0 || device->active_readers > 0 || \
+	 device->keep_format)
+
+static const unsigned int FORMATS = ARRAY_SIZE(formats);
+
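+/* render a fourcc code into the caller-provided buffer (4 characters, not
+ * NUL-terminated) */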
+static char *fourcc2str(unsigned int fourcc, char buf[4])
+{
+	buf[0] = (fourcc >> 0) & 0xFF;
+	buf[1] = (fourcc >> 8) & 0xFF;
+	buf[2] = (fourcc >> 16) & 0xFF;
+	buf[3] = (fourcc >> 24) & 0xFF;
+
+	return buf;
+}
+
+static const struct v4l2l_format *format_by_fourcc(int fourcc)
+{
+	unsigned int i;
+
+	for (i = 0; i < FORMATS; i++) {
+		if (formats[i].fourcc == fourcc)
+			return formats + i;
+	}
+
+	dprintk("unsupported format '%c%c%c%c'\n", (fourcc >> 0) & 0xFF,
+		(fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF,
+		(fourcc >> 24) & 0xFF);
+	return NULL;
+}
+
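+/* fill in the frame size plus bytesperline/sizeimage of a pix_format
+ * according to the given v4l2l_format */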
+static void pix_format_set_size(struct v4l2_pix_format *f,
+				const struct v4l2l_format *fmt,
+				unsigned int width, unsigned int height)
+{
+	f->width = width;
+	f->height = height;
+
+	if (fmt->flags & FORMAT_FLAGS_PLANAR) {
+		f->bytesperline = width; /* Y plane */
+		f->sizeimage = (width * height * fmt->depth) >> 3;
+	} else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) {
+		/* doesn't make sense for compressed formats */
+		f->bytesperline = 0;
+		f->sizeimage = (width * height * fmt->depth) >> 3;
+	} else {
+		f->bytesperline = (width * fmt->depth) >> 3;
+		f->sizeimage = height * f->bytesperline;
+	}
+}
+
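+/* clamp the requested frame size to the allowed range, (re)compute
+ * bytesperline/sizeimage (via the v4l2 core helpers where available) and
+ * normalize colorspace, field and buffer type */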
+static int v4l2l_fill_format(struct v4l2_format *fmt, int capture,
+			     const u32 minwidth, const u32 maxwidth,
+			     const u32 minheight, const u32 maxheight)
+{
+	u32 width = fmt->fmt.pix.width, height = fmt->fmt.pix.height;
+	u32 pixelformat = fmt->fmt.pix.pixelformat;
+	struct v4l2_format fmt0 = *fmt;
+	u32 bytesperline = 0, sizeimage = 0;
+	if (!width)
+		width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH;
+	if (!height)
+		height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT;
+	if (width < minwidth)
+		width = minwidth;
+	if (width > maxwidth)
+		width = maxwidth;
+	if (height < minheight)
+		height = minheight;
+	if (height > maxheight)
+		height = maxheight;
+
+	/* sets: width,height,pixelformat,bytesperline,sizeimage */
+	if (!(V4L2_TYPE_IS_MULTIPLANAR(fmt0.type))) {
+		fmt0.fmt.pix.bytesperline = 0;
+		fmt0.fmt.pix.sizeimage = 0;
+	}
+
+	if (0) {
+		;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)
+	} else if (!v4l2_fill_pixfmt(&fmt0.fmt.pix, pixelformat, width,
+				     height)) {
+		;
+	} else if (!v4l2_fill_pixfmt_mp(&fmt0.fmt.pix_mp, pixelformat, width,
+					height)) {
+		;
+#endif
+	} else {
+		const struct v4l2l_format *format =
+			format_by_fourcc(pixelformat);
+		if (!format)
+			return -EINVAL;
+		pix_format_set_size(&fmt0.fmt.pix, format, width, height);
+		fmt0.fmt.pix.pixelformat = format->fourcc;
+	}
+
+	if (V4L2_TYPE_IS_MULTIPLANAR(fmt0.type)) {
+		*fmt = fmt0;
+
+		if ((fmt->fmt.pix_mp.colorspace == V4L2_COLORSPACE_DEFAULT) ||
+		    (fmt->fmt.pix_mp.colorspace > V4L2_COLORSPACE_DCI_P3))
+			fmt->fmt.pix_mp.colorspace = V4L2_COLORSPACE_SRGB;
+		if (V4L2_FIELD_ANY == fmt->fmt.pix_mp.field)
+			fmt->fmt.pix_mp.field = V4L2_FIELD_NONE;
+		if (capture)
+			fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+		else
+			fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+	} else {
+		bytesperline = fmt->fmt.pix.bytesperline;
+		sizeimage = fmt->fmt.pix.sizeimage;
+
+		*fmt = fmt0;
+
+		if (!fmt->fmt.pix.bytesperline)
+			fmt->fmt.pix.bytesperline = bytesperline;
+		if (!fmt->fmt.pix.sizeimage)
+			fmt->fmt.pix.sizeimage = sizeimage;
+
+		if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) ||
+		    (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3))
+			fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB;
+		if (V4L2_FIELD_ANY == fmt->fmt.pix.field)
+			fmt->fmt.pix.field = V4L2_FIELD_NONE;
+		if (capture)
+			fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+		else
+			fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+	}
+
+	return 0;
+}
+
+/* Checks if v4l2l_fill_format() has set a valid, fixed sizeimage val. */
+static bool v4l2l_pix_format_has_valid_sizeimage(struct v4l2_format *fmt)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)
+	const struct v4l2_format_info *info;
+
+	info = v4l2_format_info(fmt->fmt.pix.pixelformat);
+	if (info && info->mem_planes == 1)
+		return true;
+#endif
+
+	return false;
+}
+
+static int pix_format_eq(const struct v4l2_pix_format *ref,
+			 const struct v4l2_pix_format *tgt, int strict)
+{
+	/* check if the two formats are equivalent.
+	 * ANY fields are handled gracefully
+	 */
+#define _pix_format_eq0(x)    \
+	if (ref->x != tgt->x) \
+	result = 0
+#define _pix_format_eq1(x, def)                              \
+	do {                                                 \
+		if ((def != tgt->x) && (ref->x != tgt->x)) { \
+			printk(KERN_INFO #x " failed");      \
+			result = 0;                          \
+		}                                            \
+	} while (0)
+	int result = 1;
+	_pix_format_eq0(width);
+	_pix_format_eq0(height);
+	_pix_format_eq0(pixelformat);
+	if (!strict)
+		return result;
+	_pix_format_eq1(field, V4L2_FIELD_ANY);
+	_pix_format_eq0(bytesperline);
+	_pix_format_eq0(sizeimage);
+	_pix_format_eq1(colorspace, V4L2_COLORSPACE_DEFAULT);
+	return result;
+}
+
+static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f);
+static int inner_try_setfmt(struct file *file, struct v4l2_format *fmt)
+{
+	int capture = V4L2_TYPE_IS_CAPTURE(fmt->type);
+	struct v4l2_loopback_device *dev;
+	int needschange = 0;
+	char buf[5];
+	buf[4] = 0;
+
+	dev = v4l2loopback_getdevice(file);
+
+	needschange = !(pix_format_eq(&dev->pix_format, &fmt->fmt.pix, 0));
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		fmt->fmt.pix = dev->pix_format;
+		if (needschange) {
+			if (dev->active_readers > 0 && capture) {
+				/* cannot call fmt_cap while there are readers */
+				return -EBUSY;
+			}
+			if (dev->ready_for_capture > 0 && !capture) {
+				/* cannot call fmt_out while there are writers */
+				return -EBUSY;
+			}
+		}
+	}
+	if (v4l2l_fill_format(fmt, capture, dev->min_width, dev->max_width,
+			      dev->min_height, dev->max_height) != 0) {
+		return -EINVAL;
+	}
+
+	if (1) {
+		char buf[5];
+		buf[4] = 0;
+		dprintk("capFOURCC=%s\n",
+			fourcc2str(dev->pix_format.pixelformat, buf));
+	}
+	return 0;
+}
+
+static int set_timeperframe(struct v4l2_loopback_device *dev,
+			    struct v4l2_fract *tpf)
+{
+	if ((tpf->denominator < 1) || (tpf->numerator < 1)) {
+		return -EINVAL;
+	}
+	dev->capture_param.timeperframe = *tpf;
+	dev->frame_jiffies = max(1UL, msecs_to_jiffies(1000) * tpf->numerator /
+					      tpf->denominator);
+	return 0;
+}
+
+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd);
+
+/* device attributes */
+/* available via sysfs: /sys/devices/virtual/video4linux/video* */
+
+static ssize_t attr_show_format(struct device *cd,
+				struct device_attribute *attr, char *buf)
+{
+	/* gets the current format as "FOURCC:WxH@f/s", e.g. "YUYV:320x240@1000/30" */
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+	const struct v4l2_fract *tpf;
+	char buf4cc[5], buf_fps[32];
+
+	if (!dev || !V4L2LOOPBACK_IS_FIXED_FMT(dev))
+		return 0;
+	tpf = &dev->capture_param.timeperframe;
+
+	fourcc2str(dev->pix_format.pixelformat, buf4cc);
+	buf4cc[4] = 0;
+	if (tpf->numerator == 1)
+		snprintf(buf_fps, sizeof(buf_fps), "%d", tpf->denominator);
+	else
+		snprintf(buf_fps, sizeof(buf_fps), "%d/%d", tpf->denominator,
+			 tpf->numerator);
+	return sprintf(buf, "%4s:%dx%d@%s\n", buf4cc, dev->pix_format.width,
+		       dev->pix_format.height, buf_fps);
+}
+
+static ssize_t attr_store_format(struct device *cd,
+				 struct device_attribute *attr, const char *buf,
+				 size_t len)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+	int fps_num = 0, fps_den = 1;
+
+	if (!dev)
+		return -ENODEV;
+
+	/* only fps changing is supported */
+	if (sscanf(buf, "@%d/%d", &fps_num, &fps_den) > 0) {
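+		/* timeperframe is the reciprocal of the frame rate, hence
+		 * numerator and denominator are swapped here */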
+		struct v4l2_fract f = { .numerator = fps_den,
+					.denominator = fps_num };
+		int err = 0;
+		if ((err = set_timeperframe(dev, &f)) < 0)
+			return err;
+		return len;
+	}
+	return -EINVAL;
+}
+
+static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format,
+		   attr_store_format);
+
+static ssize_t attr_show_buffers(struct device *cd,
+				 struct device_attribute *attr, char *buf)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "%d\n", dev->used_buffers);
+}
+
+static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL);
+
+static ssize_t attr_show_maxopeners(struct device *cd,
+				    struct device_attribute *attr, char *buf)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "%d\n", dev->max_openers);
+}
+
+static ssize_t attr_store_maxopeners(struct device *cd,
+				     struct device_attribute *attr,
+				     const char *buf, size_t len)
+{
+	struct v4l2_loopback_device *dev = NULL;
+	unsigned long curr = 0;
+
+	if (kstrtoul(buf, 0, &curr))
+		return -EINVAL;
+
+	dev = v4l2loopback_cd2dev(cd);
+	if (!dev)
+		return -ENODEV;
+
+	if (dev->max_openers == curr)
+		return len;
+
+	if (curr > __INT_MAX__ || dev->open_count.counter > curr) {
+		/* request to limit to fewer openers than are currently attached to us */
+		return -EINVAL;
+	}
+
+	dev->max_openers = (int)curr;
+
+	return len;
+}
+
+static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners,
+		   attr_store_maxopeners);
+
+static ssize_t attr_show_state(struct device *cd, struct device_attribute *attr,
+			       char *buf)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+
+	if (!dev)
+		return -ENODEV;
+
+	if (dev->ready_for_capture)
+		return sprintf(buf, "capture\n");
+	if (dev->ready_for_output)
+		return sprintf(buf, "output\n");
+
+	return -EAGAIN;
+}
+
+static DEVICE_ATTR(state, S_IRUGO, attr_show_state, NULL);
+
+static void v4l2loopback_remove_sysfs(struct video_device *vdev)
+{
+#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x)
+
+	if (vdev) {
+		V4L2_SYSFS_DESTROY(format);
+		V4L2_SYSFS_DESTROY(buffers);
+		V4L2_SYSFS_DESTROY(max_openers);
+		V4L2_SYSFS_DESTROY(state);
+		/* ... */
+	}
+}
+
+static void v4l2loopback_create_sysfs(struct video_device *vdev)
+{
+	int res = 0;
+
+#define V4L2_SYSFS_CREATE(x)                                 \
+	res = device_create_file(&vdev->dev, &dev_attr_##x); \
+	if (res < 0)                                         \
+	break
+	if (!vdev)
+		return;
+	do {
+		V4L2_SYSFS_CREATE(format);
+		V4L2_SYSFS_CREATE(buffers);
+		V4L2_SYSFS_CREATE(max_openers);
+		V4L2_SYSFS_CREATE(state);
+		/* ... */
+	} while (0);
+
+	if (res >= 0)
+		return;
+	dev_err(&vdev->dev, "%s error: %d\n", __func__, res);
+}
+
+/* Event APIs */
+
+#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START)
+#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000
+#define V4L2_EVENT_PRI_CLIENT_USAGE \
+	(V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1)
+
+struct v4l2_event_client_usage {
+	__u32 count;
+};
+
+/* global module data */
+/* find a device based on its device-number (e.g. '3' for /dev/video3) */
+struct v4l2loopback_lookup_cb_data {
+	int device_nr;
+	struct v4l2_loopback_device *device;
+};
+static int v4l2loopback_lookup_cb(int id, void *ptr, void *data)
+{
+	struct v4l2_loopback_device *device = ptr;
+	struct v4l2loopback_lookup_cb_data *cbdata = data;
+	if (cbdata && device && device->vdev) {
+		if (device->vdev->num == cbdata->device_nr) {
+			cbdata->device = device;
+			cbdata->device_nr = id;
+			return 1;
+		}
+	}
+	return 0;
+}
+static int v4l2loopback_lookup(int device_nr,
+			       struct v4l2_loopback_device **device)
+{
+	struct v4l2loopback_lookup_cb_data data = {
+		.device_nr = device_nr,
+		.device = NULL,
+	};
+	int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb,
+			       &data);
+	if (1 == err) {
+		if (device)
+			*device = data.device;
+		return data.device_nr;
+	}
+	return -ENODEV;
+}
+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd)
+{
+	struct video_device *loopdev = to_video_device(cd);
+	struct v4l2loopback_private *ptr =
+		(struct v4l2loopback_private *)video_get_drvdata(loopdev);
+	int nr = ptr->device_nr;
+
+	return idr_find(&v4l2loopback_index_idr, nr);
+}
+
+static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f)
+{
+	struct v4l2loopback_private *ptr = video_drvdata(f);
+	int nr = ptr->device_nr;
+
+	return idr_find(&v4l2loopback_index_idr, nr);
+}
+
+/* forward declarations */
+static void client_usage_queue_event(struct video_device *vdev);
+static void init_buffers(struct v4l2_loopback_device *dev);
+static int allocate_buffers(struct v4l2_loopback_device *dev);
+static void free_buffers(struct v4l2_loopback_device *dev);
+static void try_free_buffers(struct v4l2_loopback_device *dev);
+static int allocate_timeout_image(struct v4l2_loopback_device *dev);
+static void check_timers(struct v4l2_loopback_device *dev);
+static const struct v4l2_file_operations v4l2_loopback_fops;
+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops;
+
+/* Queue helpers */
+/* the next functions set buffer flags and adjust counters accordingly */
+static inline void set_done(struct v4l2l_buffer *buffer)
+{
+	buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED;
+	buffer->buffer.flags |= V4L2_BUF_FLAG_DONE;
+}
+
+static inline void set_queued(struct v4l2l_buffer *buffer)
+{
+	buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE;
+	buffer->buffer.flags |= V4L2_BUF_FLAG_QUEUED;
+}
+
+static inline void unset_flags(struct v4l2l_buffer *buffer)
+{
+	buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED;
+	buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE;
+}
+
+/* V4L2 ioctl caps and params calls */
+/* returns device capabilities
+ * called on VIDIOC_QUERYCAP
+ */
+static int vidioc_querycap(struct file *file, void *priv,
+			   struct v4l2_capability *cap)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	int device_nr =
+		((struct v4l2loopback_private *)video_get_drvdata(dev->vdev))
+			->device_nr;
+	__u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE;
+
+	strscpy(cap->driver, "v4l2 loopback", sizeof(cap->driver));
+	snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label);
+	snprintf(cap->bus_info, sizeof(cap->bus_info),
+		 "platform:v4l2loopback-%03d", device_nr);
+
+	if (dev->announce_all_caps) {
+		capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT;
+	} else {
+		if (dev->ready_for_capture) {
+			capabilities |= V4L2_CAP_VIDEO_CAPTURE;
+		}
+		if (dev->ready_for_output) {
+			capabilities |= V4L2_CAP_VIDEO_OUTPUT;
+		}
+	}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
+	dev->vdev->device_caps =
+#endif /* >=linux-4.7.0 */
+		cap->device_caps = cap->capabilities = capabilities;
+
+	cap->capabilities |= V4L2_CAP_DEVICE_CAPS;
+
+	memset(cap->reserved, 0, sizeof(cap->reserved));
+	return 0;
+}
+
+static int vidioc_enum_framesizes(struct file *file, void *fh,
+				  struct v4l2_frmsizeenum *argp)
+{
+	struct v4l2_loopback_device *dev;
+
+	/* there can be only one... */
+	if (argp->index)
+		return -EINVAL;
+
+	dev = v4l2loopback_getdevice(file);
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		/* format has already been negotiated
+		 * cannot change during runtime
+		 */
+		if (argp->pixel_format != dev->pix_format.pixelformat)
+			return -EINVAL;
+
+		argp->type = V4L2_FRMSIZE_TYPE_DISCRETE;
+
+		argp->discrete.width = dev->pix_format.width;
+		argp->discrete.height = dev->pix_format.height;
+	} else {
+		/* if the format has not been negotiated yet, we accept anything
+		 */
+		if (NULL == format_by_fourcc(argp->pixel_format))
+			return -EINVAL;
+
+		if (dev->min_width == dev->max_width &&
+		    dev->min_height == dev->max_height) {
+			argp->type = V4L2_FRMSIZE_TYPE_DISCRETE;
+
+			argp->discrete.width = dev->min_width;
+			argp->discrete.height = dev->min_height;
+		} else {
+			argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS;
+
+			argp->stepwise.min_width = dev->min_width;
+			argp->stepwise.min_height = dev->min_height;
+
+			argp->stepwise.max_width = dev->max_width;
+			argp->stepwise.max_height = dev->max_height;
+
+			argp->stepwise.step_width = 1;
+			argp->stepwise.step_height = 1;
+		}
+	}
+	return 0;
+}
+
+/* returns frameinterval (fps) for the set resolution
+ * called on VIDIOC_ENUM_FRAMEINTERVALS
+ */
+static int vidioc_enum_frameintervals(struct file *file, void *fh,
+				      struct v4l2_frmivalenum *argp)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+
+	/* there can be only one... */
+	if (argp->index)
+		return -EINVAL;
+
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		if (argp->width != dev->pix_format.width ||
+		    argp->height != dev->pix_format.height ||
+		    argp->pixel_format != dev->pix_format.pixelformat)
+			return -EINVAL;
+
+		argp->type = V4L2_FRMIVAL_TYPE_DISCRETE;
+		argp->discrete = dev->capture_param.timeperframe;
+	} else {
+		if (argp->width < dev->min_width ||
+		    argp->width > dev->max_width ||
+		    argp->height < dev->min_height ||
+		    argp->height > dev->max_height ||
+		    NULL == format_by_fourcc(argp->pixel_format))
+			return -EINVAL;
+
+		argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS;
+		argp->stepwise.min.numerator = 1;
+		argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX;
+		argp->stepwise.max.numerator = 1;
+		argp->stepwise.max.denominator = V4L2LOOPBACK_FPS_MIN;
+		argp->stepwise.step.numerator = 1;
+		argp->stepwise.step.denominator = 1;
+	}
+
+	return 0;
+}
+
+/* ------------------ CAPTURE ----------------------- */
+
+/* returns device formats
+ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE
+ */
+static int vidioc_enum_fmt_cap(struct file *file, void *fh,
+			       struct v4l2_fmtdesc *f)
+{
+	struct v4l2_loopback_device *dev;
+	const struct v4l2l_format *fmt;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+
+	if (f->index)
+		return -EINVAL;
+
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		/* format has been fixed, so only a single format is supported */
+		const __u32 format = dev->pix_format.pixelformat;
+
+		if ((fmt = format_by_fourcc(format))) {
+			snprintf(f->description, sizeof(f->description), "%s",
+				 fmt->name);
+		} else {
+			snprintf(f->description, sizeof(f->description),
+				 "[%c%c%c%c]", (format >> 0) & 0xFF,
+				 (format >> 8) & 0xFF, (format >> 16) & 0xFF,
+				 (format >> 24) & 0xFF);
+		}
+
+		f->pixelformat = dev->pix_format.pixelformat;
+	} else {
+		return -EINVAL;
+	}
+	f->flags = 0;
+	MARK();
+	return 0;
+}
+
+/* returns current video format
+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE
+ */
+static int vidioc_g_fmt_cap(struct file *file, void *priv,
+			    struct v4l2_format *fmt)
+{
+	struct v4l2_loopback_device *dev;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	if (!dev->ready_for_capture && !dev->ready_for_output)
+		return -EINVAL;
+
+	fmt->fmt.pix = dev->pix_format;
+	MARK();
+	return 0;
+}
+
+/* checks if it is OK to change to format fmt;
+ * the actual check is done by inner_try_setfmt();
+ * we only verify that the pixelformat is OK and set the other parameters; the
+ * app should obey this decision
+ * called on VIDIOC_TRY_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE
+ */
+static int vidioc_try_fmt_cap(struct file *file, void *priv,
+			      struct v4l2_format *fmt)
+{
+	int ret = 0;
+	if (!V4L2_TYPE_IS_CAPTURE(fmt->type))
+		return -EINVAL;
+	ret = inner_try_setfmt(file, fmt);
+	if (-EBUSY == ret)
+		return 0;
+	return ret;
+}
+
+/* sets new capture format, if possible
+ * actually the format is set by the input and we do not even check it, we just
+ * return the current one; but it is possible to set subregions of input TODO(vasaka)
+ * called on VIDIOC_S_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE
+ */
+static int vidioc_s_fmt_cap(struct file *file, void *priv,
+			    struct v4l2_format *fmt)
+{
+	int ret;
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!V4L2_TYPE_IS_CAPTURE(fmt->type))
+		return -EINVAL;
+	ret = inner_try_setfmt(file, fmt);
+	if (!ret) {
+		dev->pix_format = fmt->fmt.pix;
+	}
+	return ret;
+}
+
+/* ------------------ OUTPUT ----------------------- */
+
+/* returns device formats;
+ * LATER: allow all formats
+ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT
+ */
+static int vidioc_enum_fmt_out(struct file *file, void *fh,
+			       struct v4l2_fmtdesc *f)
+{
+	struct v4l2_loopback_device *dev;
+	const struct v4l2l_format *fmt;
+
+	dev = v4l2loopback_getdevice(file);
+
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		/* format has been fixed, so only a single format is supported */
+		const __u32 format = dev->pix_format.pixelformat;
+
+		if (f->index)
+			return -EINVAL;
+
+		if ((fmt = format_by_fourcc(format))) {
+			snprintf(f->description, sizeof(f->description), "%s",
+				 fmt->name);
+		} else {
+			snprintf(f->description, sizeof(f->description),
+				 "[%c%c%c%c]", (format >> 0) & 0xFF,
+				 (format >> 8) & 0xFF, (format >> 16) & 0xFF,
+				 (format >> 24) & 0xFF);
+		}
+
+		f->pixelformat = dev->pix_format.pixelformat;
+	} else {
+		/* fill in a dummy format */
+		/* coverity[unsigned_compare] */
+		if (f->index < 0 || f->index >= FORMATS)
+			return -EINVAL;
+
+		fmt = &formats[f->index];
+
+		f->pixelformat = fmt->fourcc;
+		snprintf(f->description, sizeof(f->description), "%s",
+			 fmt->name);
+	}
+	f->flags = 0;
+
+	return 0;
+}
+
+/* returns current video format fmt */
+/* NOTE: this is called from the producer,
+ * so if the format has not been negotiated yet,
+ * it should return ALL available formats;
+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT
+ */
+static int vidioc_g_fmt_out(struct file *file, void *priv,
+			    struct v4l2_format *fmt)
+{
+	struct v4l2_loopback_device *dev;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+
+	/*
+	 * LATER: this should return the currently valid format
+	 * gstreamer doesn't like it, if this returns -EINVAL, as it
+	 * then concludes that there is _no_ valid format
+	 * CHECK whether this assumption is wrong,
+	 * or whether we have to always provide a valid format
+	 */
+
+	fmt->fmt.pix = dev->pix_format;
+	return 0;
+}
+
+/* checks if it is OK to change to format fmt;
+ * if format is negotiated do not change it
+ * called on VIDIOC_TRY_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT
+ */
+static int vidioc_try_fmt_out(struct file *file, void *priv,
+			      struct v4l2_format *fmt)
+{
+	int ret = 0;
+	if (!V4L2_TYPE_IS_OUTPUT(fmt->type))
+		return -EINVAL;
+	ret = inner_try_setfmt(file, fmt);
+	if (-EBUSY == ret)
+		return 0;
+	return ret;
+}
+
+/* sets new output format, if possible;
+ * allocate data here because we do not know if it will be streaming or
+ * read/write IO
+ * called on VIDIOC_S_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT
+ */
+static int vidioc_s_fmt_out(struct file *file, void *priv,
+			    struct v4l2_format *fmt)
+{
+	struct v4l2_loopback_device *dev;
+	int ret;
+	char buf[5];
+	buf[4] = 0;
+	if (!V4L2_TYPE_IS_OUTPUT(fmt->type))
+		return -EINVAL;
+	dev = v4l2loopback_getdevice(file);
+
+	ret = inner_try_setfmt(file, fmt);
+	if (!ret) {
+		dev->pix_format = fmt->fmt.pix;
+		dev->pix_format_has_valid_sizeimage =
+			v4l2l_pix_format_has_valid_sizeimage(fmt);
+		dprintk("s_fmt_out(%d) %d...%d\n", ret, dev->ready_for_capture,
+			dev->pix_format.sizeimage);
+		dprintk("outFOURCC=%s\n",
+			fourcc2str(dev->pix_format.pixelformat, buf));
+
+		if (!dev->ready_for_capture) {
+			dev->buffer_size =
+				PAGE_ALIGN(dev->pix_format.sizeimage);
+			// JMZ: TODO get rid of the next line
+			fmt->fmt.pix.sizeimage = dev->buffer_size;
+			ret = allocate_buffers(dev);
+		}
+	}
+	return ret;
+}
+
+// #define V4L2L_OVERLAY
+#ifdef V4L2L_OVERLAY
+/* ------------------ OVERLAY ----------------------- */
+/* currently unsupported */
+/* GStreamer's v4l2sink is buggy, as it requires the overlay to work
+ * while it should only require it if an overlay is requested;
+ * once the gstreamer element is fixed, remove the overlay dummies
+ */
+#warning OVERLAY dummies
+static int vidioc_g_fmt_overlay(struct file *file, void *priv,
+				struct v4l2_format *fmt)
+{
+	return 0;
+}
+
+static int vidioc_s_fmt_overlay(struct file *file, void *priv,
+				struct v4l2_format *fmt)
+{
+	return 0;
+}
+#endif /* V4L2L_OVERLAY */
+
+/* ------------------ PARAMs ----------------------- */
+
+/* get some data flow parameters; only capability, fps and readbuffers have an
+ * effect on this driver
+ * called on VIDIOC_G_PARM
+ */
+static int vidioc_g_parm(struct file *file, void *priv,
+			 struct v4l2_streamparm *parm)
+{
+	/* we do not care about the type of opener; hopefully these enums will
+	 * always be compatible */
+	struct v4l2_loopback_device *dev;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	parm->parm.capture = dev->capture_param;
+	return 0;
+}
+
+/* set some data flow parameters; only capability, fps and readbuffers have an
+ * effect on this driver
+ * called on VIDIOC_S_PARM
+ */
+static int vidioc_s_parm(struct file *file, void *priv,
+			 struct v4l2_streamparm *parm)
+{
+	struct v4l2_loopback_device *dev;
+	int err = 0;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	dprintk("vidioc_s_parm called frate=%d/%d\n",
+		parm->parm.capture.timeperframe.numerator,
+		parm->parm.capture.timeperframe.denominator);
+
+	switch (parm->type) {
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		if ((err = set_timeperframe(
+			     dev, &parm->parm.capture.timeperframe)) < 0)
+			return err;
+		break;
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		if ((err = set_timeperframe(
+			     dev, &parm->parm.capture.timeperframe)) < 0)
+			return err;
+		break;
+	default:
+		return -1;
+	}
+
+	parm->parm.capture = dev->capture_param;
+	return 0;
+}
+
+#ifdef V4L2LOOPBACK_WITH_STD
+/* sets a TV standard; actually we do not need to handle this in any special way;
+ * added to support effecttv
+ * called on VIDIOC_S_STD
+ */
+static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std)
+{
+	v4l2_std_id req_std = 0, supported_std = 0;
+	const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0;
+
+	if (_std) {
+		req_std = *_std;
+		*_std = all_std;
+	}
+
+	/* we support everything in V4L2_STD_ALL, but not more... */
+	supported_std = (all_std & req_std);
+	if (no_std == supported_std)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* gets a fake video standard
+ * called on VIDIOC_G_STD
+ */
+static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm)
+{
+	if (norm)
+		*norm = V4L2_STD_ALL;
+	return 0;
+}
+/* gets a fake video standard
+ * called on VIDIOC_QUERYSTD
+ */
+static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm)
+{
+	if (norm)
+		*norm = V4L2_STD_ALL;
+	return 0;
+}
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id,
+				 s64 val)
+{
+	switch (id) {
+	case CID_KEEP_FORMAT:
+		if (val < 0 || val > 1)
+			return -EINVAL;
+		dev->keep_format = val;
+		try_free_buffers(
+			dev); /* will only free buffers if !keep_format */
+		break;
+	case CID_SUSTAIN_FRAMERATE:
+		if (val < 0 || val > 1)
+			return -EINVAL;
+		spin_lock_bh(&dev->lock);
+		dev->sustain_framerate = val;
+		check_timers(dev);
+		spin_unlock_bh(&dev->lock);
+		break;
+	case CID_TIMEOUT:
+		if (val < 0 || val > MAX_TIMEOUT)
+			return -EINVAL;
+		spin_lock_bh(&dev->lock);
+		dev->timeout_jiffies = msecs_to_jiffies(val);
+		check_timers(dev);
+		spin_unlock_bh(&dev->lock);
+		allocate_timeout_image(dev);
+		break;
+	case CID_TIMEOUT_IMAGE_IO:
+		dev->timeout_image_io = 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl)
+{
+	struct v4l2_loopback_device *dev = container_of(
+		ctrl->handler, struct v4l2_loopback_device, ctrl_handler);
+	return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val);
+}
+
+/* returns set of device outputs, in our case there is only one
+ * called on VIDIOC_ENUMOUTPUT
+ */
+static int vidioc_enum_output(struct file *file, void *fh,
+			      struct v4l2_output *outp)
+{
+	__u32 index = outp->index;
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	MARK();
+
+	if (!dev->announce_all_caps && !dev->ready_for_output)
+		return -ENOTTY;
+
+	if (0 != index)
+		return -EINVAL;
+
+	/* clear all data (including the reserved fields) */
+	memset(outp, 0, sizeof(*outp));
+
+	outp->index = index;
+	strscpy(outp->name, "loopback in", sizeof(outp->name));
+	outp->type = V4L2_OUTPUT_TYPE_ANALOG;
+	outp->audioset = 0;
+	outp->modulator = 0;
+#ifdef V4L2LOOPBACK_WITH_STD
+	outp->std = V4L2_STD_ALL;
+#ifdef V4L2_OUT_CAP_STD
+	outp->capabilities |= V4L2_OUT_CAP_STD;
+#endif /*  V4L2_OUT_CAP_STD */
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+	return 0;
+}
+
+/* which output is currently active,
+ * called on VIDIOC_G_OUTPUT
+ */
+static int vidioc_g_output(struct file *file, void *fh, unsigned int *i)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!dev->announce_all_caps && !dev->ready_for_output)
+		return -ENOTTY;
+	if (i)
+		*i = 0;
+	return 0;
+}
+
+/* set output, can make sense if we have more than one video src,
+ * called on VIDIOC_S_OUTPUT
+ */
+static int vidioc_s_output(struct file *file, void *fh, unsigned int i)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!dev->announce_all_caps && !dev->ready_for_output)
+		return -ENOTTY;
+
+	if (i)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* returns set of device inputs, in our case there is only one,
+ * but later I may add more
+ * called on VIDIOC_ENUMINPUT
+ */
+static int vidioc_enum_input(struct file *file, void *fh,
+			     struct v4l2_input *inp)
+{
+	struct v4l2_loopback_device *dev;
+	__u32 index = inp->index;
+	MARK();
+
+	if (0 != index)
+		return -EINVAL;
+
+	/* clear all data (including the reserved fields) */
+	memset(inp, 0, sizeof(*inp));
+
+	inp->index = index;
+	strscpy(inp->name, "loopback", sizeof(inp->name));
+	inp->type = V4L2_INPUT_TYPE_CAMERA;
+	inp->audioset = 0;
+	inp->tuner = 0;
+	inp->status = 0;
+
+#ifdef V4L2LOOPBACK_WITH_STD
+	inp->std = V4L2_STD_ALL;
+#ifdef V4L2_IN_CAP_STD
+	inp->capabilities |= V4L2_IN_CAP_STD;
+#endif
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+	dev = v4l2loopback_getdevice(file);
+	if (!dev->ready_for_capture) {
+		inp->status |= V4L2_IN_ST_NO_SIGNAL;
+	}
+
+	return 0;
+}
+
+/* which input is currently active,
+ * called on VIDIOC_G_INPUT
+ */
+static int vidioc_g_input(struct file *file, void *fh, unsigned int *i)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!dev->announce_all_caps && !dev->ready_for_capture)
+		return -ENOTTY;
+	if (i)
+		*i = 0;
+	return 0;
+}
+
+/* set input, can make sense if we have more than one video src,
+ * called on VIDIOC_S_INPUT
+ */
+static int vidioc_s_input(struct file *file, void *fh, unsigned int i)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!dev->announce_all_caps && !dev->ready_for_capture)
+		return -ENOTTY;
+	if (i == 0)
+		return 0;
+	return -EINVAL;
+}
+
+/* --------------- V4L2 ioctl buffer related calls ----------------- */
+
+/* negotiate buffer type
+ * only mmap streaming supported
+ * called on VIDIOC_REQBUFS
+ */
+static int vidioc_reqbufs(struct file *file, void *fh,
+			  struct v4l2_requestbuffers *b)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	int i;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+
+	dprintk("reqbufs: %d\t%d=%d\n", b->memory, b->count,
+		dev->buffers_number);
+
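+	/* an opener in timeout_image_io mode uses the dedicated timeout buffer;
+	 * reset the device flag so it only applies to this opener */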
+	if (opener->timeout_image_io) {
+		dev->timeout_image_io = 0;
+		if (b->memory != V4L2_MEMORY_MMAP)
+			return -EINVAL;
+		b->count = 2;
+		return 0;
+	}
+
+	if (V4L2_TYPE_IS_OUTPUT(b->type) && (!dev->ready_for_output)) {
+		return -EBUSY;
+	}
+
+	init_buffers(dev);
+	switch (b->memory) {
+	case V4L2_MEMORY_MMAP:
+		/* do nothing here, buffers are always allocated */
+		if (b->count < 1 || dev->buffers_number < 1)
+			return 0;
+
+		if (b->count > dev->buffers_number)
+			b->count = dev->buffers_number;
+
+		/* make sure that outbufs_list contains buffers from 0 to used_buffers-1
+		 * actually, it will already have been populated via v4l2_loopback_init()
+		 * at this point */
+		if (list_empty(&dev->outbufs_list)) {
+			for (i = 0; i < dev->used_buffers; ++i)
+				list_add_tail(&dev->buffers[i].list_head,
+					      &dev->outbufs_list);
+		}
+
+		/* also, if dev->used_buffers is going to be decreased, we should remove
+		 * out-of-range buffers from outbufs_list, and fix bufpos2index mapping */
+		if (b->count < dev->used_buffers) {
+			struct v4l2l_buffer *pos, *n;
+
+			list_for_each_entry_safe(pos, n, &dev->outbufs_list,
+						 list_head) {
+				if (pos->buffer.index >= b->count)
+					list_del(&pos->list_head);
+			}
+
+			/* after we update dev->used_buffers, buffers in outbufs_list will
+			 * correspond to dev->write_position + [0;b->count-1] range */
+			i = v4l2l_mod64(dev->write_position, b->count);
+			list_for_each_entry(pos, &dev->outbufs_list,
+					    list_head) {
+				dev->bufpos2index[i % b->count] =
+					pos->buffer.index;
+				++i;
+			}
+		}
+
+		opener->buffers_number = b->count;
+		if (opener->buffers_number < dev->used_buffers)
+			dev->used_buffers = opener->buffers_number;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/* returns the buffer asked for;
+ * give the app as many buffers as it wants, if that is less than MAX,
+ * but map them onto our inner buffers
+ * called on VIDIOC_QUERYBUF
+ */
+static int vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *b)
+{
+	enum v4l2_buf_type type;
+	int index;
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+
+	MARK();
+
+	type = b->type;
+	index = b->index;
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+
+	if ((b->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) &&
+	    (b->type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) {
+		return -EINVAL;
+	}
+	if (b->index > max_buffers)
+		return -EINVAL;
+
+	if (opener->timeout_image_io)
+		*b = dev->timeout_image_buffer.buffer;
+	else
+		*b = dev->buffers[b->index % dev->used_buffers].buffer;
+
+	b->type = type;
+	b->index = index;
+	dprintkrw("buffer type: %d (of %d with size=%ld)\n", b->memory,
+		  dev->buffers_number, dev->buffer_size);
+
+	/*  Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture'
+            https://github.com/umlaeute/v4l2loopback/issues/60 */
+	b->flags &= ~V4L2_BUF_FLAG_DONE;
+	b->flags |= V4L2_BUF_FLAG_QUEUED;
+
+	return 0;
+}
+
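+/* account for a freshly written frame: stop the pending timers, move the
+ * buffer to the tail of the output list, record its index for the current
+ * write position and advance that position */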
+static void buffer_written(struct v4l2_loopback_device *dev,
+			   struct v4l2l_buffer *buf)
+{
+	del_timer_sync(&dev->sustain_timer);
+	del_timer_sync(&dev->timeout_timer);
+
+	spin_lock_bh(&dev->list_lock);
+	list_move_tail(&buf->list_head, &dev->outbufs_list);
+	spin_unlock_bh(&dev->list_lock);
+
+	spin_lock_bh(&dev->lock);
+	dev->bufpos2index[v4l2l_mod64(dev->write_position, dev->used_buffers)] =
+		buf->buffer.index;
+	++dev->write_position;
+	dev->reread_count = 0;
+
+	check_timers(dev);
+	spin_unlock_bh(&dev->lock);
+}
+
+/* put buffer to queue
+ * called on VIDIOC_QBUF
+ */
+static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	struct v4l2l_buffer *b;
+	int index;
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+
+	if (buf->index > max_buffers)
+		return -EINVAL;
+	if (opener->timeout_image_io)
+		return 0;
+
+	index = buf->index % dev->used_buffers;
+	b = &dev->buffers[index];
+
+	switch (buf->type) {
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		dprintkrw(
+			"qbuf(CAPTURE)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n",
+			index, buf->index, buf, buf->type, buf->bytesused,
+			buf->length, buf->flags, buf->field,
+			(long long)buf->timestamp.tv_sec,
+			(long int)buf->timestamp.tv_usec, buf->sequence);
+		set_queued(b);
+		return 0;
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		dprintkrw(
+			"qbuf(OUTPUT)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n",
+			index, buf->index, buf, buf->type, buf->bytesused,
+			buf->length, buf->flags, buf->field,
+			(long long)buf->timestamp.tv_sec,
+			(long int)buf->timestamp.tv_usec, buf->sequence);
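+		/* stamp the frame ourselves unless the producer supplied a
+		 * timestamp (in which case keep it and mark TIMESTAMP_COPY) */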
+		if ((!(b->buffer.flags & V4L2_BUF_FLAG_TIMESTAMP_COPY)) &&
+		    (buf->timestamp.tv_sec == 0 && buf->timestamp.tv_usec == 0))
+			v4l2l_get_timestamp(&b->buffer);
+		else {
+			b->buffer.timestamp = buf->timestamp;
+			b->buffer.flags |= V4L2_BUF_FLAG_TIMESTAMP_COPY;
+		}
+		if (dev->pix_format_has_valid_sizeimage) {
+			if (buf->bytesused >= dev->pix_format.sizeimage) {
+				b->buffer.bytesused = dev->pix_format.sizeimage;
+			} else {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0)
+				dev_warn_ratelimited(
+					&dev->vdev->dev,
+#else
+				dprintkrw(
+#endif
+					"warning queued output buffer bytesused too small %d < %d\n",
+					buf->bytesused,
+					dev->pix_format.sizeimage);
+				b->buffer.bytesused = buf->bytesused;
+			}
+		} else {
+			b->buffer.bytesused = buf->bytesused;
+		}
+
+		set_done(b);
+		buffer_written(dev, b);
+
+		/*  Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture'
+                    https://github.com/umlaeute/v4l2loopback/issues/60 */
+		buf->flags &= ~V4L2_BUF_FLAG_DONE;
+		buf->flags |= V4L2_BUF_FLAG_QUEUED;
+
+		wake_up_all(&dev->read_event);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
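+/* a reader has something to fetch if a newer frame has been written, a
+ * re-read of the current frame was requested, or a timeout has happened */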
+static int can_read(struct v4l2_loopback_device *dev,
+		    struct v4l2_loopback_opener *opener)
+{
+	int ret;
+
+	spin_lock_bh(&dev->lock);
+	check_timers(dev);
+	ret = dev->write_position > opener->read_position ||
+	      dev->reread_count > opener->reread_count || dev->timeout_happened;
+	spin_unlock_bh(&dev->lock);
+	return ret;
+}
+
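+/* pick the buffer index a reader should dequeue next: re-serve the latest
+ * frame if nothing new was written (bumping reread_count), otherwise advance
+ * read_position, skipping ahead if the reader has fallen too far behind;
+ * if a timeout happened, the timeout image is copied into the selected buffer */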
+static int get_capture_buffer(struct file *file)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data);
+	int pos, ret;
+	int timeout_happened;
+
+	if ((file->f_flags & O_NONBLOCK) &&
+	    (dev->write_position <= opener->read_position &&
+	     dev->reread_count <= opener->reread_count &&
+	     !dev->timeout_happened))
+		return -EAGAIN;
+	wait_event_interruptible(dev->read_event, can_read(dev, opener));
+
+	spin_lock_bh(&dev->lock);
+	if (dev->write_position == opener->read_position) {
+		if (dev->reread_count > opener->reread_count + 2)
+			opener->reread_count = dev->reread_count - 1;
+		++opener->reread_count;
+		pos = v4l2l_mod64(opener->read_position + dev->used_buffers - 1,
+				  dev->used_buffers);
+	} else {
+		opener->reread_count = 0;
+		if (dev->write_position >
+		    opener->read_position + dev->used_buffers)
+			opener->read_position = dev->write_position - 1;
+		pos = v4l2l_mod64(opener->read_position, dev->used_buffers);
+		++opener->read_position;
+	}
+	timeout_happened = dev->timeout_happened;
+	dev->timeout_happened = 0;
+	spin_unlock_bh(&dev->lock);
+
+	ret = dev->bufpos2index[pos];
+	if (timeout_happened) {
+		if (ret < 0) {
+			dprintk("trying to return not mapped buf[%d]\n", ret);
+			return -EFAULT;
+		}
+		/* although allocated on-demand, timeout_image is freed only
+		 * in free_buffers(), so we don't need to worry about it being
+		 * deallocated suddenly */
+		memcpy(dev->image + dev->buffers[ret].buffer.m.offset,
+		       dev->timeout_image, dev->buffer_size);
+	}
+	return ret;
+}
+
+/* dequeue a buffer
+ * called on VIDIOC_DQBUF
+ */
+static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	int index;
+	struct v4l2l_buffer *b;
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+	if (opener->timeout_image_io) {
+		*buf = dev->timeout_image_buffer.buffer;
+		return 0;
+	}
+
+	switch (buf->type) {
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		index = get_capture_buffer(file);
+		if (index < 0)
+			return index;
+		dprintkrw("capture DQBUF pos: %lld index: %d\n",
+			  (long long)(opener->read_position - 1), index);
+		if (!(dev->buffers[index].buffer.flags &
+		      V4L2_BUF_FLAG_MAPPED)) {
+			dprintk("trying to return not mapped buf[%d]\n", index);
+			return -EINVAL;
+		}
+		unset_flags(&dev->buffers[index]);
+		*buf = dev->buffers[index].buffer;
+		dprintkrw(
+			"dqbuf(CAPTURE)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n",
+			index, buf->index, buf, buf->type, buf->bytesused,
+			buf->length, buf->flags, buf->field,
+			(long long)buf->timestamp.tv_sec,
+			(long int)buf->timestamp.tv_usec, buf->sequence);
+		return 0;
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		spin_lock_bh(&dev->list_lock);
+
+		b = list_entry(dev->outbufs_list.prev, struct v4l2l_buffer,
+			       list_head);
+		list_move_tail(&b->list_head, &dev->outbufs_list);
+
+		spin_unlock_bh(&dev->list_lock);
+		dprintkrw("output DQBUF index: %d\n", b->buffer.index);
+		unset_flags(b);
+		*buf = b->buffer;
+		buf->type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+		dprintkrw(
+			"dqbuf(OUTPUT)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n",
+			index, buf->index, buf, buf->type, buf->bytesused,
+			buf->length, buf->flags, buf->field,
+			(long long)buf->timestamp.tv_sec,
+			(long int)buf->timestamp.tv_usec, buf->sequence);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/* ------------- STREAMING ------------------- */
+
+/* start streaming
+ * called on VIDIOC_STREAMON
+ */
+static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+
+	switch (type) {
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		if (!dev->ready_for_capture) {
+			int ret = allocate_buffers(dev);
+			if (ret < 0)
+				return ret;
+		}
+		opener->type = WRITER;
+		dev->ready_for_output = 0;
+		dev->ready_for_capture++;
+		return 0;
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		if (!dev->ready_for_capture)
+			return -EIO;
+		if (dev->active_readers > 0)
+			return -EBUSY;
+		opener->type = READER;
+		dev->active_readers++;
+		client_usage_queue_event(dev->vdev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+	return -EINVAL;
+}
+
+/* stop streaming
+ * called on VIDIOC_STREAMOFF
+ */
+static int vidioc_streamoff(struct file *file, void *fh,
+			    enum v4l2_buf_type type)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+
+	MARK();
+	dprintk("%d\n", type);
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+	switch (type) {
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		if (dev->ready_for_capture > 0)
+			dev->ready_for_capture--;
+		return 0;
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		if (opener->type == READER) {
+			opener->type = UNNEGOTIATED;
+			dev->active_readers--;
+			client_usage_queue_event(dev->vdev);
+		}
+		return 0;
+	default:
+		return -EINVAL;
+	}
+	return -EINVAL;
+}
+
+#ifdef CONFIG_VIDEO_V4L1_COMPAT
+static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p)
+{
+	struct v4l2_loopback_device *dev;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	p->frames = dev->buffers_number;
+	p->offsets[0] = 0;
+	p->offsets[1] = 0;
+	p->size = dev->buffer_size;
+	return 0;
+}
+#endif
+
+static void client_usage_queue_event(struct video_device *vdev)
+{
+	struct v4l2_event ev;
+	struct v4l2_loopback_device *dev;
+
+	dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device,
+			   v4l2_dev);
+
+	memset(&ev, 0, sizeof(ev));
+	ev.type = V4L2_EVENT_PRI_CLIENT_USAGE;
+	((struct v4l2_event_client_usage *)&ev.u)->count = dev->active_readers;
+
+	v4l2_event_queue(vdev, &ev);
+}
+
+static int client_usage_ops_add(struct v4l2_subscribed_event *sev,
+				unsigned elems)
+{
+	if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL))
+		return 0;
+
+	client_usage_queue_event(sev->fh->vdev);
+	return 0;
+}
+
+static void client_usage_ops_replace(struct v4l2_event *old,
+				     const struct v4l2_event *new)
+{
+	*((struct v4l2_event_client_usage *)&old->u) =
+		*((struct v4l2_event_client_usage *)&new->u);
+}
+
+static void client_usage_ops_merge(const struct v4l2_event *old,
+				   struct v4l2_event *new)
+{
+	*((struct v4l2_event_client_usage *)&new->u) =
+		*((struct v4l2_event_client_usage *)&old->u);
+}
+
+const struct v4l2_subscribed_event_ops client_usage_ops = {
+	.add = client_usage_ops_add,
+	.replace = client_usage_ops_replace,
+	.merge = client_usage_ops_merge,
+};
+
+static int vidioc_subscribe_event(struct v4l2_fh *fh,
+				  const struct v4l2_event_subscription *sub)
+{
+	switch (sub->type) {
+	case V4L2_EVENT_CTRL:
+		return v4l2_ctrl_subscribe_event(fh, sub);
+	case V4L2_EVENT_PRI_CLIENT_USAGE:
+		return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops);
+	}
+
+	return -EINVAL;
+}
+
+/* file operations */
+static void vm_open(struct vm_area_struct *vma)
+{
+	struct v4l2l_buffer *buf;
+	MARK();
+
+	buf = vma->vm_private_data;
+	buf->use_count++;
+
+	buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED;
+}
+
+static void vm_close(struct vm_area_struct *vma)
+{
+	struct v4l2l_buffer *buf;
+	MARK();
+
+	buf = vma->vm_private_data;
+	buf->use_count--;
+
+	if (buf->use_count <= 0)
+		buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED;
+}
+
+static struct vm_operations_struct vm_ops = {
+	.open = vm_open,
+	.close = vm_close,
+};
+
+static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	u8 *addr;
+	unsigned long start;
+	unsigned long size;
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	struct v4l2l_buffer *buffer = NULL;
+	MARK();
+
+	start = (unsigned long)vma->vm_start;
+	size = (unsigned long)(vma->vm_end - vma->vm_start);
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(file->private_data);
+
+	if (size > dev->buffer_size) {
+		dprintk("userspace tries to mmap too much, fail\n");
+		return -EINVAL;
+	}
+	if (opener->timeout_image_io) {
+		/* we are going to map the timeout_image_buffer */
+		if ((vma->vm_pgoff << PAGE_SHIFT) !=
+		    dev->buffer_size * MAX_BUFFERS) {
+			dprintk("invalid mmap offset for timeout_image_io mode\n");
+			return -EINVAL;
+		}
+	} else if ((vma->vm_pgoff << PAGE_SHIFT) >
+		   dev->buffer_size * (dev->buffers_number - 1)) {
+		dprintk("userspace tries to mmap too far, fail\n");
+		return -EINVAL;
+	}
+
+	/* FIXXXXXME: allocation should not happen here! */
+	if (NULL == dev->image)
+		if (allocate_buffers(dev) < 0)
+			return -EINVAL;
+
+	if (opener->timeout_image_io) {
+		buffer = &dev->timeout_image_buffer;
+		addr = dev->timeout_image;
+	} else {
+		int i;
+		for (i = 0; i < dev->buffers_number; ++i) {
+			buffer = &dev->buffers[i];
+			if ((buffer->buffer.m.offset >> PAGE_SHIFT) ==
+			    vma->vm_pgoff)
+				break;
+		}
+
+		if (i >= dev->buffers_number)
+			return -EINVAL;
+
+		addr = dev->image + (vma->vm_pgoff << PAGE_SHIFT);
+	}
+
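+	/* the image is vmalloc()ed, so insert it into the VMA page by page */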
+	while (size > 0) {
+		struct page *page;
+
+		page = vmalloc_to_page(addr);
+
+		if (vm_insert_page(vma, start, page) < 0)
+			return -EAGAIN;
+
+		start += PAGE_SIZE;
+		addr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+
+	vma->vm_ops = &vm_ops;
+	vma->vm_private_data = buffer;
+
+	vm_open(vma);
+
+	MARK();
+	return 0;
+}
+
+static unsigned int v4l2_loopback_poll(struct file *file,
+				       struct poll_table_struct *pts)
+{
+	struct v4l2_loopback_opener *opener;
+	struct v4l2_loopback_device *dev;
+	__poll_t req_events = poll_requested_events(pts);
+	int ret_mask = 0;
+	MARK();
+
+	opener = fh_to_opener(file->private_data);
+	dev = v4l2loopback_getdevice(file);
+
+	if (req_events & POLLPRI) {
+		if (!v4l2_event_pending(&opener->fh))
+			poll_wait(file, &opener->fh.wait, pts);
+		if (v4l2_event_pending(&opener->fh)) {
+			ret_mask |= POLLPRI;
+			if (!(req_events & DEFAULT_POLLMASK))
+				return ret_mask;
+		}
+	}
+
+	switch (opener->type) {
+	case WRITER:
+		ret_mask |= POLLOUT | POLLWRNORM;
+		break;
+	case READER:
+		if (!can_read(dev, opener)) {
+			if (ret_mask)
+				return ret_mask;
+			poll_wait(file, &dev->read_event, pts);
+		}
+		if (can_read(dev, opener))
+			ret_mask |= POLLIN | POLLRDNORM;
+		if (v4l2_event_pending(&opener->fh))
+			ret_mask |= POLLPRI;
+		break;
+	default:
+		break;
+	}
+
+	MARK();
+	return ret_mask;
+}
+
+/* we do not want to limit device opens: there can be as many readers as the
+ * user wants; writers are limited by means of setting the writer field */
+static int v4l2_loopback_open(struct file *file)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	MARK();
+	dev = v4l2loopback_getdevice(file);
+	if (dev->open_count.counter >= dev->max_openers)
+		return -EBUSY;
+	/* kfree on close */
+	opener = kzalloc(sizeof(*opener), GFP_KERNEL);
+	if (opener == NULL)
+		return -ENOMEM;
+
+	atomic_inc(&dev->open_count);
+
+	opener->timeout_image_io = dev->timeout_image_io;
+	if (opener->timeout_image_io) {
+		int r = allocate_timeout_image(dev);
+
+		if (r < 0) {
+			dprintk("timeout image allocation failed\n");
+
+			atomic_dec(&dev->open_count);
+
+			kfree(opener);
+			return r;
+		}
+	}
+
+	v4l2_fh_init(&opener->fh, video_devdata(file));
+	file->private_data = &opener->fh;
+
+	v4l2_fh_add(&opener->fh);
+	dprintk("opened dev:%p with image:%p\n", dev, dev ? dev->image : NULL);
+	MARK();
+	return 0;
+}
+
+static int v4l2_loopback_close(struct file *file)
+{
+	struct v4l2_loopback_opener *opener;
+	struct v4l2_loopback_device *dev;
+	int is_writer = 0, is_reader = 0;
+	MARK();
+
+	opener = fh_to_opener(file->private_data);
+	dev = v4l2loopback_getdevice(file);
+
+	if (WRITER == opener->type)
+		is_writer = 1;
+	if (READER == opener->type)
+		is_reader = 1;
+
+	atomic_dec(&dev->open_count);
+	if (dev->open_count.counter == 0) {
+		del_timer_sync(&dev->sustain_timer);
+		del_timer_sync(&dev->timeout_timer);
+	}
+	try_free_buffers(dev);
+
+	v4l2_fh_del(&opener->fh);
+	v4l2_fh_exit(&opener->fh);
+
+	kfree(opener);
+	if (is_writer)
+		dev->ready_for_output = 1;
+	if (is_reader) {
+		dev->active_readers--;
+		client_usage_queue_event(dev->vdev);
+	}
+	MARK();
+	return 0;
+}
+
+static ssize_t v4l2_loopback_read(struct file *file, char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	int read_index;
+	struct v4l2_loopback_device *dev;
+	struct v4l2_buffer *b;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+
+	read_index = get_capture_buffer(file);
+	if (read_index < 0)
+		return read_index;
+	if (count > dev->buffer_size)
+		count = dev->buffer_size;
+	b = &dev->buffers[read_index].buffer;
+	if (count > b->bytesused)
+		count = b->bytesused;
+	if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset),
+			 count)) {
+		printk(KERN_ERR
+		       "v4l2-loopback: failed copy_to_user() in read buf\n");
+		return -EFAULT;
+	}
+	dprintkrw("leave v4l2_loopback_read()\n");
+	return count;
+}
+
+static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct v4l2_loopback_opener *opener;
+	struct v4l2_loopback_device *dev;
+	int write_index;
+	struct v4l2_buffer *b;
+	int err = 0;
+
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(file->private_data);
+
+	if (UNNEGOTIATED == opener->type) {
+		spin_lock(&dev->lock);
+
+		if (dev->ready_for_output) {
+			err = vidioc_streamon(file, file->private_data,
+					      V4L2_BUF_TYPE_VIDEO_OUTPUT);
+		}
+
+		spin_unlock(&dev->lock);
+
+		if (err < 0)
+			return err;
+	}
+
+	if (WRITER != opener->type)
+		return -EINVAL;
+
+	if (!dev->ready_for_capture) {
+		int ret = allocate_buffers(dev);
+		if (ret < 0)
+			return ret;
+		dev->ready_for_capture = 1;
+	}
+	dprintkrw("v4l2_loopback_write() trying to write %zu bytes\n", count);
+	if (count > dev->buffer_size)
+		count = dev->buffer_size;
+
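+	/* next slot in the ring of used_buffers output buffers */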
+	write_index = v4l2l_mod64(dev->write_position, dev->used_buffers);
+	b = &dev->buffers[write_index].buffer;
+
+	if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf,
+			   count)) {
+		printk(KERN_ERR
+		       "v4l2-loopback: failed copy_from_user() in write buf, could not write %zu\n",
+		       count);
+		return -EFAULT;
+	}
+	v4l2l_get_timestamp(b);
+	b->bytesused = count;
+	b->sequence = dev->write_position;
+	buffer_written(dev, &dev->buffers[write_index]);
+	wake_up_all(&dev->read_event);
+	dprintkrw("leave v4l2_loopback_write()\n");
+	return count;
+}
+
+/* init functions */
+/* frees buffers, if already allocated */
+static void free_buffers(struct v4l2_loopback_device *dev)
+{
+	MARK();
+	dprintk("freeing image@%p for dev:%p\n", dev ? dev->image : NULL, dev);
+	if (!dev)
+		return;
+	if (dev->image) {
+		vfree(dev->image);
+		dev->image = NULL;
+	}
+	if (dev->timeout_image) {
+		vfree(dev->timeout_image);
+		dev->timeout_image = NULL;
+	}
+	dev->imagesize = 0;
+}
+/* frees buffers, if they are no longer needed */
+static void try_free_buffers(struct v4l2_loopback_device *dev)
+{
+	MARK();
+	if (0 == dev->open_count.counter && !dev->keep_format) {
+		free_buffers(dev);
+		dev->ready_for_capture = 0;
+		dev->buffer_size = 0;
+		dev->write_position = 0;
+	}
+}
+/* allocates buffers, if buffer_size is set */
+static int allocate_buffers(struct v4l2_loopback_device *dev)
+{
+	int err;
+
+	MARK();
+	/* the buffers are vfree()d in the close file operation once no open
+	 * handles are left */
+
+	if (dev->buffer_size < 1 || dev->buffers_number < 1)
+		return -EINVAL;
+
+	if ((__LONG_MAX__ / dev->buffer_size) < dev->buffers_number)
+		return -ENOSPC;
+
+	if (dev->image) {
+		dprintk("allocating buffers again: %ld %ld\n",
+			dev->buffer_size * dev->buffers_number, dev->imagesize);
+		/* FIXME: prevent double allocation more intelligently! */
+		if (dev->buffer_size * dev->buffers_number == dev->imagesize)
+			return 0;
+
+		/* check whether the total number of readers/writers is <=1 */
+		if ((dev->ready_for_capture + dev->active_readers) <= 1)
+			free_buffers(dev);
+		else
+			return -EINVAL;
+	}
+
+	dev->imagesize = (unsigned long)dev->buffer_size *
+			 (unsigned long)dev->buffers_number;
+
+	dprintk("allocating %ld = %ldx%d\n", dev->imagesize, dev->buffer_size,
+		dev->buffers_number);
+	err = -ENOMEM;
+
+	if (dev->timeout_jiffies > 0) {
+		err = allocate_timeout_image(dev);
+		if (err < 0)
+			goto error;
+	}
+
+	dev->image = vmalloc(dev->imagesize);
+	if (dev->image == NULL)
+		goto error;
+
+	dprintk("vmallocated %ld bytes\n", dev->imagesize);
+	MARK();
+
+	init_buffers(dev);
+	return 0;
+
+error:
+	free_buffers(dev);
+	return err;
+}
+
+/* init the inner buffers; type and flags are set up as for capture-mode
+ * buffers */
+static void init_buffers(struct v4l2_loopback_device *dev)
+{
+	int i;
+	int buffer_size;
+	int bytesused;
+	MARK();
+
+	buffer_size = dev->buffer_size;
+	bytesused = dev->pix_format.sizeimage;
+	for (i = 0; i < dev->buffers_number; ++i) {
+		struct v4l2_buffer *b = &dev->buffers[i].buffer;
+		b->index = i;
+		b->bytesused = bytesused;
+		b->length = buffer_size;
+		b->field = V4L2_FIELD_NONE;
+		b->flags = 0;
+		b->m.offset = i * buffer_size;
+		b->memory = V4L2_MEMORY_MMAP;
+		b->sequence = 0;
+		b->timestamp.tv_sec = 0;
+		b->timestamp.tv_usec = 0;
+		b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+
+		v4l2l_get_timestamp(b);
+	}
+	dev->timeout_image_buffer = dev->buffers[0];
+	dev->timeout_image_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size;
+	MARK();
+}
+
+static int allocate_timeout_image(struct v4l2_loopback_device *dev)
+{
+	MARK();
+	if (dev->buffer_size <= 0) {
+		dev->timeout_image_io = 0;
+		return -EINVAL;
+	}
+
+	if (dev->timeout_image == NULL) {
+		dev->timeout_image = vzalloc(dev->buffer_size);
+		if (dev->timeout_image == NULL) {
+			dev->timeout_image_io = 0;
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+/* fill in the video_device structure (it is registered later via
+ * video_register_device()) */
+static void init_vdev(struct video_device *vdev, int nr)
+{
+	MARK();
+
+#ifdef V4L2LOOPBACK_WITH_STD
+	vdev->tvnorms = V4L2_STD_ALL;
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+	vdev->vfl_type = VFL_TYPE_VIDEO;
+	vdev->fops = &v4l2_loopback_fops;
+	vdev->ioctl_ops = &v4l2_loopback_ioctl_ops;
+	vdev->release = &video_device_release;
+	vdev->minor = -1;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
+	vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE |
+			    V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE |
+			    V4L2_CAP_STREAMING;
+#endif
+
+	if (debug > 1)
+		vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL |
+				  V4L2_DEV_DEBUG_IOCTL_ARG;
+
+	vdev->vfl_dir = VFL_DIR_M2M;
+
+	MARK();
+}
+
+/* init default capture parameters, only fps may be changed in future */
+static void init_capture_param(struct v4l2_captureparm *capture_param)
+{
+	MARK();
+	capture_param->capability = 0;
+	capture_param->capturemode = 0;
+	capture_param->extendedmode = 0;
+	capture_param->readbuffers = max_buffers;
+	capture_param->timeperframe.numerator = 1;
+	capture_param->timeperframe.denominator = 30;
+}
+
+static void check_timers(struct v4l2_loopback_device *dev)
+{
+	if (!dev->ready_for_capture)
+		return;
+
+	if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer))
+		mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies);
+	if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer))
+		mod_timer(&dev->sustain_timer,
+			  jiffies + dev->frame_jiffies * 3 / 2);
+}
+#ifdef HAVE_TIMER_SETUP
+static void sustain_timer_clb(struct timer_list *t)
+{
+	struct v4l2_loopback_device *dev = from_timer(dev, t, sustain_timer);
+#else
+static void sustain_timer_clb(unsigned long nr)
+{
+	struct v4l2_loopback_device *dev =
+		idr_find(&v4l2loopback_index_idr, nr);
+#endif
+	spin_lock(&dev->lock);
+	if (dev->sustain_framerate) {
+		dev->reread_count++;
+		dprintkrw("reread: %lld %d\n", (long long)dev->write_position,
+			  dev->reread_count);
+		if (dev->reread_count == 1)
+			mod_timer(&dev->sustain_timer,
+				  jiffies + max(1UL, dev->frame_jiffies / 2));
+		else
+			mod_timer(&dev->sustain_timer,
+				  jiffies + dev->frame_jiffies);
+		wake_up_all(&dev->read_event);
+	}
+	spin_unlock(&dev->lock);
+}
+#ifdef HAVE_TIMER_SETUP
+static void timeout_timer_clb(struct timer_list *t)
+{
+	struct v4l2_loopback_device *dev = from_timer(dev, t, timeout_timer);
+#else
+static void timeout_timer_clb(unsigned long nr)
+{
+	struct v4l2_loopback_device *dev =
+		idr_find(&v4l2loopback_index_idr, nr);
+#endif
+	spin_lock(&dev->lock);
+	if (dev->timeout_jiffies > 0) {
+		dev->timeout_happened = 1;
+		mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies);
+		wake_up_all(&dev->read_event);
+	}
+	spin_unlock(&dev->lock);
+}
+
+/* init loopback main structure */
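+/* DEFAULT_FROM_CONF(member, cond, dflt) evaluates to conf->member, unless
+ * conf is NULL or "conf->member cond" holds, in which case dflt is used;
+ * e.g. DEFAULT_FROM_CONF(max_openers, <= 0, max_openers) falls back to the
+ * module parameter when no positive value was configured. */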
+#define DEFAULT_FROM_CONF(confmember, default_condition, default_value)        \
+	((conf) ?                                                              \
+		 ((conf->confmember default_condition) ? (default_value) :     \
+							 (conf->confmember)) : \
+		 default_value)
+
+static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_ctrl_handler *hdl;
+	struct v4l2loopback_private *vdev_priv = NULL;
+
+	int err = -ENOMEM;
+
+	u32 _width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH;
+	u32 _height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT;
+
+	u32 _min_width = DEFAULT_FROM_CONF(min_width,
+					   < V4L2LOOPBACK_SIZE_MIN_WIDTH,
+					   V4L2LOOPBACK_SIZE_MIN_WIDTH);
+	u32 _min_height = DEFAULT_FROM_CONF(min_height,
+					    < V4L2LOOPBACK_SIZE_MIN_HEIGHT,
+					    V4L2LOOPBACK_SIZE_MIN_HEIGHT);
+	u32 _max_width = DEFAULT_FROM_CONF(max_width, < _min_width, max_width);
+	u32 _max_height =
+		DEFAULT_FROM_CONF(max_height, < _min_height, max_height);
+	bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ?
+					  (conf->announce_all_caps) :
+					  V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS;
+	int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers);
+	int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers);
+
+	int nr = -1;
+
+	_announce_all_caps = (!!_announce_all_caps);
+
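+	/* derive the device number to request from output_nr/capture_nr:
+	 * both < 0 means auto-allocate; if both are given they must match,
+	 * since split capture/output devices are not supported yet */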
+	if (conf) {
+		const int output_nr = conf->output_nr;
+#ifdef SPLIT_DEVICES
+		const int capture_nr = conf->capture_nr;
+#else
+		const int capture_nr = output_nr;
+#endif
+		if (capture_nr >= 0 && output_nr == capture_nr) {
+			nr = output_nr;
+		} else if (capture_nr < 0 && output_nr < 0) {
+			nr = -1;
+		} else if (capture_nr < 0) {
+			nr = output_nr;
+		} else if (output_nr < 0) {
+			nr = capture_nr;
+		} else {
+			printk(KERN_ERR
+			       "split OUTPUT and CAPTURE devices not yet supported.");
+			printk(KERN_INFO
+			       "both devices must have the same number (%d != %d).",
+			       output_nr, capture_nr);
+			return -EINVAL;
+		}
+	}
+
+	if (idr_find(&v4l2loopback_index_idr, nr))
+		return -EEXIST;
+
+	dprintk("creating v4l2loopback-device #%d\n", nr);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	/* allocate an id; if nr >= 0, we're requesting that specific id */
+	if (nr >= 0) {
+		err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1,
+				GFP_KERNEL);
+		if (err == -ENOSPC)
+			err = -EEXIST;
+	} else {
+		err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL);
+	}
+	if (err < 0)
+		goto out_free_dev;
+	nr = err;
+	err = -ENOMEM;
+
+	if (conf && conf->card_label[0]) {
+		snprintf(dev->card_label, sizeof(dev->card_label), "%s",
+			 conf->card_label);
+	} else {
+		snprintf(dev->card_label, sizeof(dev->card_label),
+			 "Dummy video device (0x%04X)", nr);
+	}
+	snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name),
+		 "v4l2loopback-%03d", nr);
+
+	err = v4l2_device_register(NULL, &dev->v4l2_dev);
+	if (err)
+		goto out_free_idr;
+	MARK();
+
+	dev->vdev = video_device_alloc();
+	if (dev->vdev == NULL) {
+		err = -ENOMEM;
+		goto out_unregister;
+	}
+
+	vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL);
+	if (vdev_priv == NULL) {
+		err = -ENOMEM;
+		goto out_unregister;
+	}
+
+	video_set_drvdata(dev->vdev, vdev_priv);
+	if (video_get_drvdata(dev->vdev) == NULL) {
+		err = -ENOMEM;
+		goto out_unregister;
+	}
+
+	MARK();
+	snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s",
+		 dev->card_label);
+
+	vdev_priv->device_nr = nr;
+
+	init_vdev(dev->vdev, nr);
+	dev->vdev->v4l2_dev = &dev->v4l2_dev;
+	init_capture_param(&dev->capture_param);
+	err = set_timeperframe(dev, &dev->capture_param.timeperframe);
+	if (err)
+		goto out_unregister;
+	dev->keep_format = 0;
+	dev->sustain_framerate = 0;
+
+	dev->announce_all_caps = _announce_all_caps;
+	dev->min_width = _min_width;
+	dev->min_height = _min_height;
+	dev->max_width = _max_width;
+	dev->max_height = _max_height;
+	dev->max_openers = _max_openers;
+	dev->buffers_number = dev->used_buffers = _max_buffers;
+
+	dev->write_position = 0;
+
+	MARK();
+	spin_lock_init(&dev->lock);
+	spin_lock_init(&dev->list_lock);
+	INIT_LIST_HEAD(&dev->outbufs_list);
+	if (list_empty(&dev->outbufs_list)) {
+		int i;
+
+		for (i = 0; i < dev->used_buffers; ++i)
+			list_add_tail(&dev->buffers[i].list_head,
+				      &dev->outbufs_list);
+	}
+	memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index));
+	atomic_set(&dev->open_count, 0);
+	dev->ready_for_capture = 0;
+	dev->ready_for_output = 1;
+
+	dev->buffer_size = 0;
+	dev->image = NULL;
+	dev->imagesize = 0;
+#ifdef HAVE_TIMER_SETUP
+	timer_setup(&dev->sustain_timer, sustain_timer_clb, 0);
+	timer_setup(&dev->timeout_timer, timeout_timer_clb, 0);
+#else
+	setup_timer(&dev->sustain_timer, sustain_timer_clb, nr);
+	setup_timer(&dev->timeout_timer, timeout_timer_clb, nr);
+#endif
+	dev->reread_count = 0;
+	dev->timeout_jiffies = 0;
+	dev->timeout_image = NULL;
+	dev->timeout_happened = 0;
+
+	hdl = &dev->ctrl_handler;
+	err = v4l2_ctrl_handler_init(hdl, 4);
+	if (err)
+		goto out_unregister;
+	v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL);
+	v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL);
+	v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL);
+	v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL);
+	if (hdl->error) {
+		err = hdl->error;
+		goto out_free_handler;
+	}
+	dev->v4l2_dev.ctrl_handler = hdl;
+
+	err = v4l2_ctrl_handler_setup(hdl);
+	if (err)
+		goto out_free_handler;
+
+	/* FIXME set buffers to 0 */
+
+	/* Set initial format */
+	if (_width < _min_width)
+		_width = _min_width;
+	if (_width > _max_width)
+		_width = _max_width;
+	if (_height < _min_height)
+		_height = _min_height;
+	if (_height > _max_height)
+		_height = _max_height;
+
+	dev->pix_format.width = _width;
+	dev->pix_format.height = _height;
+	dev->pix_format.pixelformat = formats[0].fourcc;
+	dev->pix_format.colorspace =
+		V4L2_COLORSPACE_DEFAULT; /* do we need to set this ? */
+	dev->pix_format.field = V4L2_FIELD_NONE;
+
+	dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage);
+	dprintk("buffer_size = %ld (=%d)\n", dev->buffer_size,
+		dev->pix_format.sizeimage);
+
+	if (dev->buffer_size && ((err = allocate_buffers(dev)) < 0))
+		goto out_free_handler;
+
+	init_waitqueue_head(&dev->read_event);
+
+	/* register the device -> it creates /dev/video* */
+	if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) {
+		printk(KERN_ERR
+		       "v4l2loopback: failed video_register_device()\n");
+		err = -EFAULT;
+		goto out_free_device;
+	}
+	v4l2loopback_create_sysfs(dev->vdev);
+
+	MARK();
+	if (ret_nr)
+		*ret_nr = dev->vdev->num;
+	return 0;
+
+out_free_device:
+	video_device_release(dev->vdev);
+out_free_handler:
+	v4l2_ctrl_handler_free(&dev->ctrl_handler);
+out_unregister:
+	video_set_drvdata(dev->vdev, NULL);
+	if (vdev_priv != NULL)
+		kfree(vdev_priv);
+	v4l2_device_unregister(&dev->v4l2_dev);
+out_free_idr:
+	idr_remove(&v4l2loopback_index_idr, nr);
+out_free_dev:
+	kfree(dev);
+	return err;
+}
+
+static void v4l2_loopback_remove(struct v4l2_loopback_device *dev)
+{
+	free_buffers(dev);
+	v4l2loopback_remove_sysfs(dev->vdev);
+	kfree(video_get_drvdata(dev->vdev));
+	video_unregister_device(dev->vdev);
+	v4l2_device_unregister(&dev->v4l2_dev);
+	v4l2_ctrl_handler_free(&dev->ctrl_handler);
+	kfree(dev);
+}
+
+static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd,
+				       unsigned long parm)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_config conf;
+	struct v4l2_loopback_config *confptr = &conf;
+	int device_nr, capture_nr, output_nr;
+	int ret;
+
+	ret = mutex_lock_killable(&v4l2loopback_ctl_mutex);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	switch (cmd) {
+	default:
+		ret = -ENOSYS;
+		break;
+		/* add a v4l2loopback device (pair), based on the user-provided specs */
+	case V4L2LOOPBACK_CTL_ADD:
+		if (parm) {
+			if ((ret = copy_from_user(&conf, (void *)parm,
+						  sizeof(conf))) < 0)
+				break;
+		} else
+			confptr = NULL;
+		ret = v4l2_loopback_add(confptr, &device_nr);
+		if (ret >= 0)
+			ret = device_nr;
+		break;
+		/* remove a v4l2loopback device (both capture and output) */
+	case V4L2LOOPBACK_CTL_REMOVE:
+		ret = v4l2loopback_lookup((int)parm, &dev);
+		if (ret >= 0 && dev) {
+			int nr = ret;
+			ret = -EBUSY;
+			if (dev->open_count.counter > 0)
+				break;
+			idr_remove(&v4l2loopback_index_idr, nr);
+			v4l2_loopback_remove(dev);
+			ret = 0;
+		}
+		break;
+		/* get information for a loopback device.
+		 * this is mostly about limits (which cannot be queried directly
+		 * with VIDIOC_G_FMT and friends)
+		 */
+	case V4L2LOOPBACK_CTL_QUERY:
+		if (!parm)
+			break;
+		if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) <
+		    0)
+			break;
+		capture_nr = output_nr = conf.output_nr;
+#ifdef SPLIT_DEVICES
+		capture_nr = conf.capture_nr;
+#endif
+		device_nr = (output_nr < 0) ? capture_nr : output_nr;
+		MARK();
+		/* get the device from either capture_nr or output_nr (whatever is valid) */
+		if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0)
+			break;
+		MARK();
+		/* if we got the device from output_nr and there is a valid capture_nr,
+                 * make sure that both refer to the same device (or bail out)
+                 */
+		if ((device_nr != capture_nr) && (capture_nr >= 0) &&
+		    ((ret = v4l2loopback_lookup(capture_nr, 0)) < 0))
+			break;
+		MARK();
+		/* if otoh, we got the device from capture_nr and there is a valid output_nr,
+                 * make sure that both refer to the same device (or bail out)
+                 */
+		if ((device_nr != output_nr) && (output_nr >= 0) &&
+		    ((ret = v4l2loopback_lookup(output_nr, 0)) < 0))
+			break;
+		MARK();
+
+		/* v4l2_loopback_config identified a single device, so fetch the data */
+		snprintf(conf.card_label, sizeof(conf.card_label), "%s",
+			 dev->card_label);
+		MARK();
+		conf.output_nr = dev->vdev->num;
+#ifdef SPLIT_DEVICES
+		conf.capture_nr = dev->vdev->num;
+#endif
+		conf.min_width = dev->min_width;
+		conf.min_height = dev->min_height;
+		conf.max_width = dev->max_width;
+		conf.max_height = dev->max_height;
+		conf.announce_all_caps = dev->announce_all_caps;
+		conf.max_buffers = dev->buffers_number;
+		conf.max_openers = dev->max_openers;
+		conf.debug = debug;
+		MARK();
+		if (copy_to_user((void *)parm, &conf, sizeof(conf))) {
+			ret = -EFAULT;
+			break;
+		}
+		MARK();
+		ret = 0;
+		break;
+	}
+
+	MARK();
+	mutex_unlock(&v4l2loopback_ctl_mutex);
+	MARK();
+	return ret;
+}
+
+/* LINUX KERNEL */
+
+static const struct file_operations v4l2loopback_ctl_fops = {
+	// clang-format off
+	.owner		= THIS_MODULE,
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= v4l2loopback_control_ioctl,
+	.compat_ioctl	= v4l2loopback_control_ioctl,
+	.llseek		= noop_llseek,
+	// clang-format on
+};
+
+static struct miscdevice v4l2loopback_misc = {
+	// clang-format off
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "v4l2loopback",
+	.fops		= &v4l2loopback_ctl_fops,
+	// clang-format on
+};
+
+static const struct v4l2_file_operations v4l2_loopback_fops = {
+	// clang-format off
+	.owner		= THIS_MODULE,
+	.open		= v4l2_loopback_open,
+	.release	= v4l2_loopback_close,
+	.read		= v4l2_loopback_read,
+	.write		= v4l2_loopback_write,
+	.poll		= v4l2_loopback_poll,
+	.mmap		= v4l2_loopback_mmap,
+	.unlocked_ioctl	= video_ioctl2,
+	// clang-format on
+};
+
+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = {
+	// clang-format off
+	.vidioc_querycap		= &vidioc_querycap,
+	.vidioc_enum_framesizes		= &vidioc_enum_framesizes,
+	.vidioc_enum_frameintervals	= &vidioc_enum_frameintervals,
+
+	.vidioc_enum_output		= &vidioc_enum_output,
+	.vidioc_g_output		= &vidioc_g_output,
+	.vidioc_s_output		= &vidioc_s_output,
+
+	.vidioc_enum_input		= &vidioc_enum_input,
+	.vidioc_g_input			= &vidioc_g_input,
+	.vidioc_s_input			= &vidioc_s_input,
+
+	.vidioc_enum_fmt_vid_cap	= &vidioc_enum_fmt_cap,
+	.vidioc_g_fmt_vid_cap		= &vidioc_g_fmt_cap,
+	.vidioc_s_fmt_vid_cap		= &vidioc_s_fmt_cap,
+	.vidioc_try_fmt_vid_cap		= &vidioc_try_fmt_cap,
+
+	.vidioc_enum_fmt_vid_out	= &vidioc_enum_fmt_out,
+	.vidioc_s_fmt_vid_out		= &vidioc_s_fmt_out,
+	.vidioc_g_fmt_vid_out		= &vidioc_g_fmt_out,
+	.vidioc_try_fmt_vid_out		= &vidioc_try_fmt_out,
+
+#ifdef V4L2L_OVERLAY
+	.vidioc_s_fmt_vid_overlay	= &vidioc_s_fmt_overlay,
+	.vidioc_g_fmt_vid_overlay	= &vidioc_g_fmt_overlay,
+#endif
+
+#ifdef V4L2LOOPBACK_WITH_STD
+	.vidioc_s_std			= &vidioc_s_std,
+	.vidioc_g_std			= &vidioc_g_std,
+	.vidioc_querystd		= &vidioc_querystd,
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+	.vidioc_g_parm			= &vidioc_g_parm,
+	.vidioc_s_parm			= &vidioc_s_parm,
+
+	.vidioc_reqbufs			= &vidioc_reqbufs,
+	.vidioc_querybuf		= &vidioc_querybuf,
+	.vidioc_qbuf			= &vidioc_qbuf,
+	.vidioc_dqbuf			= &vidioc_dqbuf,
+
+	.vidioc_streamon		= &vidioc_streamon,
+	.vidioc_streamoff		= &vidioc_streamoff,
+
+#ifdef CONFIG_VIDEO_V4L1_COMPAT
+	.vidiocgmbuf			= &vidiocgmbuf,
+#endif
+
+	.vidioc_subscribe_event		= &vidioc_subscribe_event,
+	.vidioc_unsubscribe_event	= &v4l2_event_unsubscribe,
+	// clang-format on
+};
+
+static int free_device_cb(int id, void *ptr, void *data)
+{
+	struct v4l2_loopback_device *dev = ptr;
+	v4l2_loopback_remove(dev);
+	return 0;
+}
+static void free_devices(void)
+{
+	idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL);
+	idr_destroy(&v4l2loopback_index_idr);
+}
+
+static int __init v4l2loopback_init_module(void)
+{
+	const u32 min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH;
+	const u32 min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT;
+	int err;
+	int i;
+	MARK();
+
+	err = misc_register(&v4l2loopback_misc);
+	if (err < 0)
+		return err;
+
+	if (devices < 0) {
+		devices = 1;
+
+		/* try guessing the devices from the "video_nr" parameter */
+		for (i = MAX_DEVICES - 1; i >= 0; i--) {
+			if (video_nr[i] >= 0) {
+				devices = i + 1;
+				break;
+			}
+		}
+	}
+
+	if (devices > MAX_DEVICES) {
+		devices = MAX_DEVICES;
+		printk(KERN_INFO
+		       "v4l2loopback: number of initial devices is limited to: %d\n",
+		       MAX_DEVICES);
+	}
+
+	if (max_buffers > MAX_BUFFERS) {
+		max_buffers = MAX_BUFFERS;
+		printk(KERN_INFO
+		       "v4l2loopback: number of buffers is limited to: %d\n",
+		       MAX_BUFFERS);
+	}
+
+	if (max_openers < 0) {
+		printk(KERN_INFO
+		       "v4l2loopback: allowing %d openers rather than %d\n",
+		       2, max_openers);
+		max_openers = 2;
+	}
+
+	if (max_width < min_width) {
+		max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH;
+		printk(KERN_INFO "v4l2loopback: using max_width %d\n",
+		       max_width);
+	}
+	if (max_height < min_height) {
+		max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT;
+		printk(KERN_INFO "v4l2loopback: using max_height %d\n",
+		       max_height);
+	}
+
+	for (i = 0; i < devices; i++) {
+		struct v4l2_loopback_config cfg = {
+			// clang-format off
+			.output_nr		= video_nr[i],
+#ifdef SPLIT_DEVICES
+			.capture_nr		= video_nr[i],
+#endif
+			.min_width		= min_width,
+			.min_height		= min_height,
+			.max_width		= max_width,
+			.max_height		= max_height,
+			.announce_all_caps	= (!exclusive_caps[i]),
+			.max_buffers		= max_buffers,
+			.max_openers		= max_openers,
+			.debug			= debug,
+			// clang-format on
+		};
+		cfg.card_label[0] = 0;
+		if (card_label[i])
+			snprintf(cfg.card_label, sizeof(cfg.card_label), "%s",
+				 card_label[i]);
+		err = v4l2_loopback_add(&cfg, 0);
+		if (err) {
+			free_devices();
+			goto error;
+		}
+	}
+
+	dprintk("module installed\n");
+
+	printk(KERN_INFO "v4l2loopback driver version %d.%d.%d%s loaded\n",
+	       // clang-format off
+	       (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff,
+	       (V4L2LOOPBACK_VERSION_CODE >>  8) & 0xff,
+	       (V4L2LOOPBACK_VERSION_CODE      ) & 0xff,
+#ifdef SNAPSHOT_VERSION
+	       " (" __stringify(SNAPSHOT_VERSION) ")"
+#else
+	       ""
+#endif
+	       );
+	// clang-format on
+
+	return 0;
+error:
+	misc_deregister(&v4l2loopback_misc);
+	return err;
+}
+
+static void v4l2loopback_cleanup_module(void)
+{
+	MARK();
+	/* unregister the device -> it deletes /dev/video* */
+	free_devices();
+	/* and get rid of /dev/v4l2loopback */
+	misc_deregister(&v4l2loopback_misc);
+	dprintk("module removed\n");
+}
+
+MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR);
+
+module_init(v4l2loopback_init_module);
+module_exit(v4l2loopback_cleanup_module);
diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h
new file mode 100644
index 000000000000..1bc7e6b747a4
--- /dev/null
+++ b/drivers/media/v4l2-core/v4l2loopback.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * v4l2loopback.h
+ *
+ * Written by IOhannes m zmölnig, 7/1/20.
+ *
+ * Copyright 2020 by IOhannes m zmölnig.  Redistribution of this file is
+ * permitted under the GNU General Public License.
+ */
+#ifndef _V4L2LOOPBACK_H
+#define _V4L2LOOPBACK_H
+
+#define V4L2LOOPBACK_VERSION_MAJOR 0
+#define V4L2LOOPBACK_VERSION_MINOR 13
+#define V4L2LOOPBACK_VERSION_BUGFIX 1
+
+/* /dev/v4l2loopback interface */
+
+struct v4l2_loopback_config {
+	/**
+         * the device-number (/dev/video<nr>)
+         * V4L2LOOPBACK_CTL_ADD:
+         * setting this to a value < 0 will allocate an available one;
+         * if nr >= 0 and the device already exists, the ioctl fails with EEXIST
+         * if output_nr and capture_nr are the same, only a single device will be created
+	 * NOTE: currently split-devices (where output_nr and capture_nr differ)
+	 *   are not implemented yet.
+	 *   until then, requesting different device-IDs will result in EINVAL.
+         *
+         * V4L2LOOPBACK_CTL_QUERY:
+         * either both output_nr and capture_nr must refer to the same loopback,
+         * or one (and only one) of them must be -1
+         *
+         */
+	int output_nr;
+	int unused; /*capture_nr;*/
+
+	/**
+         * a nice name for your device
+         * if (*card_label)==0, an automatic name is assigned
+         */
+	char card_label[32];
+
+	/**
+         * allowed frame size
+         * if too low, default values are used
+         */
+	unsigned int min_width;
+	unsigned int max_width;
+	unsigned int min_height;
+	unsigned int max_height;
+
+	/**
+         * number of buffers to allocate for the queue
+         * if set to <=0, default values are used
+         */
+	int max_buffers;
+
+	/**
+         * how many consumers are allowed to open this device concurrently
+         * if set to <=0, default values are used
+         */
+	int max_openers;
+
+	/**
+         * set the debugging level for this device
+         */
+	int debug;
+
+	/**
+         * whether to announce OUTPUT/CAPTURE capabilities exclusively
+         * for this device or not
+         * (!exclusive_caps)
+	 * NOTE: this is going to be removed once separate output/capture
+	 *       devices are implemented
+         */
+	int announce_all_caps;
+};
+
+/* a pointer to a (struct v4l2_loopback_config) with all the values you wish to impose
+ * on the to-be-created device already set.
+ * if the ptr is NULL, a new device is created with default values at the driver's discretion.
+ *
+ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY,
+ * to get more information on the device)
+ */
+#define V4L2LOOPBACK_CTL_ADD 0x4C80
+
+/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set
+ * (the two values must either refer to video-devices associated with the same loopback device
+ *  or exactly one of them must be < 0)
+ */
+#define V4L2LOOPBACK_CTL_QUERY 0x4C82
+
+/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */
+#define V4L2LOOPBACK_CTL_REMOVE 0x4C81
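+
+/*
+ * Typical userspace usage of the /dev/v4l2loopback control device
+ * (sketch; error handling omitted):
+ *
+ *   int fd = open("/dev/v4l2loopback", O_RDWR);
+ *   int nr = ioctl(fd, V4L2LOOPBACK_CTL_ADD, NULL);  // defaults; returns device number
+ *   struct v4l2_loopback_config cfg = { .output_nr = nr };
+ *   ioctl(fd, V4L2LOOPBACK_CTL_QUERY, &cfg);         // fills in name and limits
+ *   ioctl(fd, V4L2LOOPBACK_CTL_REMOVE, nr);
+ */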
+
+#endif /* _V4L2LOOPBACK_H */
diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h
new file mode 100644
index 000000000000..d855a3796554
--- /dev/null
+++ b/drivers/media/v4l2-core/v4l2loopback_formats.h
@@ -0,0 +1,445 @@
+static const struct v4l2l_format formats[] = {
+#ifndef V4L2_PIX_FMT_VP9
+#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0')
+#endif
+#ifndef V4L2_PIX_FMT_HEVC
+#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C')
+#endif
+
+	/* here come the packed formats */
+	{
+		.name = "32 bpp RGB, le",
+		.fourcc = V4L2_PIX_FMT_BGR32,
+		.depth = 32,
+		.flags = 0,
+	},
+	{
+		.name = "32 bpp RGB, be",
+		.fourcc = V4L2_PIX_FMT_RGB32,
+		.depth = 32,
+		.flags = 0,
+	},
+	{
+		.name = "24 bpp RGB, le",
+		.fourcc = V4L2_PIX_FMT_BGR24,
+		.depth = 24,
+		.flags = 0,
+	},
+	{
+		.name = "24 bpp RGB, be",
+		.fourcc = V4L2_PIX_FMT_RGB24,
+		.depth = 24,
+		.flags = 0,
+	},
+#ifdef V4L2_PIX_FMT_ABGR32
+	{
+		.name = "32 bpp RGBA, le",
+		.fourcc = V4L2_PIX_FMT_ABGR32,
+		.depth = 32,
+		.flags = 0,
+	},
+#endif
+#ifdef V4L2_PIX_FMT_RGBA32
+	{
+		.name = "32 bpp RGBA",
+		.fourcc = V4L2_PIX_FMT_RGBA32,
+		.depth = 32,
+		.flags = 0,
+	},
+#endif
+#ifdef V4L2_PIX_FMT_RGB332
+	{
+		.name = "8 bpp RGB-3-3-2",
+		.fourcc = V4L2_PIX_FMT_RGB332,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB332 */
+#ifdef V4L2_PIX_FMT_RGB444
+	{
+		.name = "16 bpp RGB (xxxxrrrr ggggbbbb)",
+		.fourcc = V4L2_PIX_FMT_RGB444,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB444 */
+#ifdef V4L2_PIX_FMT_RGB555
+	{
+		.name = "16 bpp RGB-5-5-5",
+		.fourcc = V4L2_PIX_FMT_RGB555,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB555 */
+#ifdef V4L2_PIX_FMT_RGB565
+	{
+		.name = "16 bpp RGB-5-6-5",
+		.fourcc = V4L2_PIX_FMT_RGB565,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB565 */
+#ifdef V4L2_PIX_FMT_RGB555X
+	{
+		.name = "16 bpp RGB-5-5-5 BE",
+		.fourcc = V4L2_PIX_FMT_RGB555X,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB555X */
+#ifdef V4L2_PIX_FMT_RGB565X
+	{
+		.name = "16 bpp RGB-5-6-5 BE",
+		.fourcc = V4L2_PIX_FMT_RGB565X,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB565X */
+#ifdef V4L2_PIX_FMT_BGR666
+	{
+		.name = "18 bpp BGR-6-6-6",
+		.fourcc = V4L2_PIX_FMT_BGR666,
+		.depth = 18,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_BGR666 */
+	{
+		.name = "4:2:2, packed, YUYV",
+		.fourcc = V4L2_PIX_FMT_YUYV,
+		.depth = 16,
+		.flags = 0,
+	},
+	{
+		.name = "4:2:2, packed, UYVY",
+		.fourcc = V4L2_PIX_FMT_UYVY,
+		.depth = 16,
+		.flags = 0,
+	},
+#ifdef V4L2_PIX_FMT_YVYU
+	{
+		.name = "4:2:2, packed YVYU",
+		.fourcc = V4L2_PIX_FMT_YVYU,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif
+#ifdef V4L2_PIX_FMT_VYUY
+	{
+		.name = "4:2:2, packed VYUY",
+		.fourcc = V4L2_PIX_FMT_VYUY,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif
+	{
+		.name = "4:2:2, packed YYUV",
+		.fourcc = V4L2_PIX_FMT_YYUV,
+		.depth = 16,
+		.flags = 0,
+	},
+	{
+		.name = "YUV-8-8-8-8",
+		.fourcc = V4L2_PIX_FMT_YUV32,
+		.depth = 32,
+		.flags = 0,
+	},
+	{
+		.name = "8 bpp, Greyscale",
+		.fourcc = V4L2_PIX_FMT_GREY,
+		.depth = 8,
+		.flags = 0,
+	},
+#ifdef V4L2_PIX_FMT_Y4
+	{
+		.name = "4 bpp Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y4,
+		.depth = 4,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_Y4 */
+#ifdef V4L2_PIX_FMT_Y6
+	{
+		.name = "6 bpp Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y6,
+		.depth = 6,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_Y6 */
+#ifdef V4L2_PIX_FMT_Y10
+	{
+		.name = "10 bpp Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y10,
+		.depth = 10,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_Y10 */
+#ifdef V4L2_PIX_FMT_Y12
+	{
+		.name = "12 bpp Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y12,
+		.depth = 12,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_Y12 */
+	{
+		.name = "16 bpp, Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y16,
+		.depth = 16,
+		.flags = 0,
+	},
+#ifdef V4L2_PIX_FMT_YUV444
+	{
+		.name = "16 bpp xxxxyyyy uuuuvvvv",
+		.fourcc = V4L2_PIX_FMT_YUV444,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_YUV444 */
+#ifdef V4L2_PIX_FMT_YUV555
+	{
+		.name = "16 bpp YUV-5-5-5",
+		.fourcc = V4L2_PIX_FMT_YUV555,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_YUV555 */
+#ifdef V4L2_PIX_FMT_YUV565
+	{
+		.name = "16 bpp YUV-5-6-5",
+		.fourcc = V4L2_PIX_FMT_YUV565,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_YUV565 */
+
+/* bayer formats */
+#ifdef V4L2_PIX_FMT_SRGGB8
+	{
+		.name = "Bayer RGGB 8bit",
+		.fourcc = V4L2_PIX_FMT_SRGGB8,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_SRGGB8 */
+#ifdef V4L2_PIX_FMT_SGRBG8
+	{
+		.name = "Bayer GRBG 8bit",
+		.fourcc = V4L2_PIX_FMT_SGRBG8,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_SGRBG8 */
+#ifdef V4L2_PIX_FMT_SGBRG8
+	{
+		.name = "Bayer GBRG 8bit",
+		.fourcc = V4L2_PIX_FMT_SGBRG8,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_SGBRG8 */
+#ifdef V4L2_PIX_FMT_SBGGR8
+	{
+		.name = "Bayer BA81 8bit",
+		.fourcc = V4L2_PIX_FMT_SBGGR8,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_SBGGR8 */
+
+	/* here come the planar formats */
+	{
+		.name = "4:1:0, planar, Y-Cr-Cb",
+		.fourcc = V4L2_PIX_FMT_YVU410,
+		.depth = 9,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+	{
+		.name = "4:2:0, planar, Y-Cr-Cb",
+		.fourcc = V4L2_PIX_FMT_YVU420,
+		.depth = 12,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+	{
+		.name = "4:1:0, planar, Y-Cb-Cr",
+		.fourcc = V4L2_PIX_FMT_YUV410,
+		.depth = 9,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+	{
+		.name = "4:2:0, planar, Y-Cb-Cr",
+		.fourcc = V4L2_PIX_FMT_YUV420,
+		.depth = 12,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#ifdef V4L2_PIX_FMT_YUV422P
+	{
+		.name = "16 bpp YVU422 planar",
+		.fourcc = V4L2_PIX_FMT_YUV422P,
+		.depth = 16,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#endif /* V4L2_PIX_FMT_YUV422P */
+#ifdef V4L2_PIX_FMT_YUV411P
+	{
+		.name = "16 bpp YVU411 planar",
+		.fourcc = V4L2_PIX_FMT_YUV411P,
+		.depth = 16,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#endif /* V4L2_PIX_FMT_YUV411P */
+#ifdef V4L2_PIX_FMT_Y41P
+	{
+		.name = "12 bpp YUV 4:1:1",
+		.fourcc = V4L2_PIX_FMT_Y41P,
+		.depth = 12,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#endif /* V4L2_PIX_FMT_Y41P */
+#ifdef V4L2_PIX_FMT_NV12
+	{
+		.name = "12 bpp Y/CbCr 4:2:0 ",
+		.fourcc = V4L2_PIX_FMT_NV12,
+		.depth = 12,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#endif /* V4L2_PIX_FMT_NV12 */
+
+/* here come the compressed formats */
+
+#ifdef V4L2_PIX_FMT_MJPEG
+	{
+		.name = "Motion-JPEG",
+		.fourcc = V4L2_PIX_FMT_MJPEG,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MJPEG */
+#ifdef V4L2_PIX_FMT_JPEG
+	{
+		.name = "JFIF JPEG",
+		.fourcc = V4L2_PIX_FMT_JPEG,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_JPEG */
+#ifdef V4L2_PIX_FMT_DV
+	{
+		.name = "DV1394",
+		.fourcc = V4L2_PIX_FMT_DV,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_DV */
+#ifdef V4L2_PIX_FMT_MPEG
+	{
+		.name = "MPEG-1/2/4 Multiplexed",
+		.fourcc = V4L2_PIX_FMT_MPEG,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MPEG */
+#ifdef V4L2_PIX_FMT_H264
+	{
+		.name = "H264 with start codes",
+		.fourcc = V4L2_PIX_FMT_H264,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_H264 */
+#ifdef V4L2_PIX_FMT_H264_NO_SC
+	{
+		.name = "H264 without start codes",
+		.fourcc = V4L2_PIX_FMT_H264_NO_SC,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_H264_NO_SC */
+#ifdef V4L2_PIX_FMT_H264_MVC
+	{
+		.name = "H264 MVC",
+		.fourcc = V4L2_PIX_FMT_H264_MVC,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_H264_MVC */
+#ifdef V4L2_PIX_FMT_H263
+	{
+		.name = "H263",
+		.fourcc = V4L2_PIX_FMT_H263,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_H263 */
+#ifdef V4L2_PIX_FMT_MPEG1
+	{
+		.name = "MPEG-1 ES",
+		.fourcc = V4L2_PIX_FMT_MPEG1,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MPEG1 */
+#ifdef V4L2_PIX_FMT_MPEG2
+	{
+		.name = "MPEG-2 ES",
+		.fourcc = V4L2_PIX_FMT_MPEG2,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MPEG2 */
+#ifdef V4L2_PIX_FMT_MPEG4
+	{
+		.name = "MPEG-4 part 2 ES",
+		.fourcc = V4L2_PIX_FMT_MPEG4,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MPEG4 */
+#ifdef V4L2_PIX_FMT_XVID
+	{
+		.name = "Xvid",
+		.fourcc = V4L2_PIX_FMT_XVID,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_XVID */
+#ifdef V4L2_PIX_FMT_VC1_ANNEX_G
+	{
+		.name = "SMPTE 421M Annex G compliant stream",
+		.fourcc = V4L2_PIX_FMT_VC1_ANNEX_G,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */
+#ifdef V4L2_PIX_FMT_VC1_ANNEX_L
+	{
+		.name = "SMPTE 421M Annex L compliant stream",
+		.fourcc = V4L2_PIX_FMT_VC1_ANNEX_L,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */
+#ifdef V4L2_PIX_FMT_VP8
+	{
+		.name = "VP8",
+		.fourcc = V4L2_PIX_FMT_VP8,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_VP8 */
+#ifdef V4L2_PIX_FMT_VP9
+	{
+		.name = "VP9",
+		.fourcc = V4L2_PIX_FMT_VP9,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_VP9 */
+#ifdef V4L2_PIX_FMT_HEVC
+	{
+		.name = "HEVC",
+		.fourcc = V4L2_PIX_FMT_HEVC,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_HEVC */
+};
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index 038ccbd9e3ba..de5e4f5145af 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -1,4 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
+ifdef CONFIG_X86_64
+ifdef CONFIG_SATA_AHCI
+obj-y += intel-nvme-remap.o
+endif
+endif
+
 obj-$(CONFIG_PCIE_CADENCE) += cadence/
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o
diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c
new file mode 100644
index 000000000000..e105e6f5cc91
--- /dev/null
+++ b/drivers/pci/controller/intel-nvme-remap.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel remapped NVMe device support.
+ *
+ * Copyright (c) 2019 Endless Mobile, Inc.
+ * Author: Daniel Drake <drake@endlessm.com>
+ *
+ * Some products ship by default with the SATA controller in "RAID" or
+ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this
+ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe
+ * devices disappear from the PCI bus, and instead their I/O memory becomes
+ * available within the AHCI device BARs.
+ *
+ * This scheme is understood to be a way of avoiding usage of the standard
+ * Windows NVMe driver under that OS, mandating usage of Intel's driver
+ * instead, which has better power management, and presumably offers some
+ * RAID/disk-caching solutions too.
+ *
+ * Here in this driver, we support the remapped NVMe mode by claiming the
+ * AHCI device and creating a fake PCIe root port. On the new bus, the
+ * original AHCI device is exposed with only minor tweaks. Then, fake PCI
+ * devices corresponding to the remapped NVMe devices are created. The usual
+ * ahci and nvme drivers are then expected to bind to these devices and
+ * operate as normal.
+ *
+ * The PCI configuration space for the NVMe devices is completely
+ * unavailable, so we fake a minimal one and hope for the best.
+ *
+ * Interrupts are shared between the AHCI and NVMe devices. For simplicity,
+ * we only support the legacy interrupt here, although MSI support
+ * could potentially be added later.
+ */
+
+#define MODULE_NAME "intel-nvme-remap"
+
+#include <linux/ahci-remap.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#define AHCI_PCI_BAR_STANDARD 5
+
+struct nvme_remap_dev {
+	struct pci_dev		*dev;		/* AHCI device */
+	struct pci_bus		*bus;		/* our fake PCI bus */
+	struct pci_sysdata	sysdata;
+	int			irq_base;	/* our fake interrupts */
+
+	/*
+	 * When we detect an all-ones write to a BAR register, this flag
+	 * is set, so that we return the BAR size on the next read (a
+	 * standard PCI behaviour).
+	 * This includes the assumption that an all-ones BAR write is
+	 * immediately followed by a read of the same register.
+	 */
+	bool			bar_sizing;
+
+	/*
+	 * Resources copied from the AHCI device, to be regarded as
+	 * resources on our fake bus.
+	 */
+	struct resource		ahci_resources[PCI_NUM_RESOURCES];
+
+	/* Resources corresponding to the NVMe devices. */
+	struct resource		remapped_dev_mem[AHCI_MAX_REMAP];
+
+	/* Number of remapped NVMe devices found. */
+	int			num_remapped_devices;
+};
+
+static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus)
+{
+	return container_of(bus->sysdata, struct nvme_remap_dev, sysdata);
+}
+
+
+/******** PCI configuration space **********/
+
+/*
+ * Helper macros for tweaking returned contents of PCI configuration space.
+ *
+ * value contains len bytes of data read from reg.
+ * If fixup_reg is included in that range, fix up the contents of that
+ * register to fixed_value.
+ */
+#define NR_FIX8(fixup_reg, fixed_value) do { \
+		if (reg <= fixup_reg && fixup_reg < reg + len) \
+			((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \
+	} while (0)
+
+#define NR_FIX16(fixup_reg, fixed_value) do { \
+		NR_FIX8(fixup_reg, fixed_value); \
+		NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+	} while (0)
+
+#define NR_FIX24(fixup_reg, fixed_value) do { \
+		NR_FIX8(fixup_reg, fixed_value); \
+		NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+		NR_FIX8(fixup_reg + 2, fixed_value >> 16); \
+	} while (0)
+
+#define NR_FIX32(fixup_reg, fixed_value) do { \
+		NR_FIX16(fixup_reg, (u16) fixed_value); \
+		NR_FIX16(fixup_reg + 2, fixed_value >> 16); \
+	} while (0)
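+
+/*
+ * For example, a 4-byte config read at reg 0x08 (PCI_CLASS_REVISION) returns
+ * the revision ID in byte 0 and the class code in bytes 1..3; because
+ * PCI_CLASS_PROG (0x09) falls inside [reg, reg + len), NR_FIX24(PCI_CLASS_PROG, ...)
+ * patches only those three class-code bytes of *value and leaves the
+ * revision byte untouched.
+ */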
+
+/*
+ * Read PCI config space of the slot 0 (AHCI) device.
+ * We pass through the read request to the underlying device, but
+ * tweak the results in some cases.
+ */
+static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg,
+				     int len, u32 *value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+	int ret;
+
+	ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn,
+				      reg, len, value);
+	if (ret)
+		return ret;
+
+	/*
+	 * Adjust the device class, to prevent this driver from attempting to
+	 * Adjust the device class to prevent this driver from additionally
+	 * probing the device we're simulating here.
+	NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI);
+
+	/*
+	 * Unset interrupt pin, otherwise ACPI tries to find routing
+	 * info for our virtual IRQ, fails, and complains.
+	 */
+	NR_FIX8(PCI_INTERRUPT_PIN, 0);
+
+	/*
+	 * Truncate the AHCI BAR to not include the region that covers the
+	 * hidden devices. This will cause the ahci driver to successfully
+	 * probe the new device (instead of handing it over to this driver).
+	 */
+	if (nrdev->bar_sizing) {
+		NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1));
+		nrdev->bar_sizing = false;
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+/*
+ * Read PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we provide a minimal,
+ * fake config space instead.
+ */
+static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port,
+					int reg, int len, u32 *value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct resource *remapped_mem;
+
+	if (port > nrdev->num_remapped_devices)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	*value = 0;
+	remapped_mem = &nrdev->remapped_dev_mem[port - 1];
+
+	/* Set a Vendor ID, otherwise Linux assumes no device is present */
+	NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL);
+
+	/* Always report memory access and bus mastering as enabled */
+	NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+
+	/* Set class so that nvme driver probes us */
+	NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS);
+
+	if (nrdev->bar_sizing) {
+		NR_FIX32(PCI_BASE_ADDRESS_0,
+			 ~(resource_size(remapped_mem) - 1));
+		nrdev->bar_sizing = false;
+	} else {
+		resource_size_t mem_start = remapped_mem->start;
+
+		mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+		NR_FIX32(PCI_BASE_ADDRESS_0, mem_start);
+		mem_start >>= 32;
+		NR_FIX32(PCI_BASE_ADDRESS_1, mem_start);
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+/* Read PCI configuration space. */
+static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn,
+			       int reg, int len, u32 *value)
+{
+	if (PCI_SLOT(devfn) == 0)
+		return nvme_remap_pci_read_slot0(bus, reg, len, value);
+	else
+		return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn),
+						    reg, len, value);
+}
+
+/*
+ * Write PCI config space of the slot 0 (AHCI) device.
+ * Apart from the special case of BAR sizing, we disable all writes.
+ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master)
+ * that would affect the operation of the NVMe devices.
+ */
+static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg,
+				      int len, u32 value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+
+	if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) {
+		/*
+		 * Writing all-ones to a BAR means that the size of the
+		 * memory region is being checked. Flag this so that we can
+		 * reply with an appropriate size on the next read.
+		 */
+		if (value == ~0)
+			nrdev->bar_sizing = true;
+
+		return ahci_dev_bus->ops->write(ahci_dev_bus,
+						nrdev->dev->devfn,
+						reg, len, value);
+	}
+
+	return PCIBIOS_SET_FAILED;
+}
+
+/*
+ * Write PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we reject all
+ * writes, except for the special case of BAR probing.
+ */
+static int nvme_remap_pci_write_remapped(struct pci_bus *bus,
+					 unsigned int port,
+					 int reg, int len, u32 value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+
+	if (port > nrdev->num_remapped_devices)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	/*
+	 * Writing all-ones to a BAR means that the size of the memory
+	 * region is being checked. Flag this so that we can reply with
+	 * an appropriate size on the next read.
+	 */
+	if (value == ~0 && reg >= PCI_BASE_ADDRESS_0
+			&& reg <= PCI_BASE_ADDRESS_5) {
+		nrdev->bar_sizing = true;
+		return PCIBIOS_SUCCESSFUL;
+	}
+
+	return PCIBIOS_SET_FAILED;
+}
+
+/* Write PCI configuration space. */
+static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn,
+				int reg, int len, u32 value)
+{
+	if (PCI_SLOT(devfn) == 0)
+		return nvme_remap_pci_write_slot0(bus, reg, len, value);
+	else
+		return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn),
+						     reg, len, value);
+}
+
+static struct pci_ops nvme_remap_pci_ops = {
+	.read	= nvme_remap_pci_read,
+	.write	= nvme_remap_pci_write,
+};
+
+
+/******** Initialization & exit **********/
+
+/*
+ * Find a PCI domain ID to use for our fake bus.
+ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits).
+ */
+static int find_free_domain(void)
+{
+	int domain = 0xffff;
+	struct pci_bus *bus = NULL;
+
+	while ((bus = pci_find_next_bus(bus)) != NULL)
+		domain = max_t(int, domain, pci_domain_nr(bus));
+
+	return domain + 1;
+}
+
+static int find_remapped_devices(struct nvme_remap_dev *nrdev,
+				 struct list_head *resources)
+{
+	void __iomem *mmio;
+	int i, count = 0;
+	u32 cap;
+
+	mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD,
+			  pci_resource_len(nrdev->dev,
+					   AHCI_PCI_BAR_STANDARD));
+	if (!mmio)
+		return -ENODEV;
+
+	/* Check if this device might have remapped nvme devices. */
+	if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K ||
+	    !(readl(mmio + AHCI_VSCAP) & 1))
+		return -ENODEV;
+
+	cap = readq(mmio + AHCI_REMAP_CAP);
+	for (i = AHCI_MAX_REMAP-1; i >= 0; i--) {
+		struct resource *remapped_mem;
+
+		if ((cap & (1 << i)) == 0)
+			continue;
+		if (readl(mmio + ahci_remap_dcc(i))
+				!= PCI_CLASS_STORAGE_EXPRESS)
+			continue;
+
+		/* We've found a remapped device */
+		remapped_mem = &nrdev->remapped_dev_mem[count++];
+		remapped_mem->start =
+			pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD)
+			+ ahci_remap_base(i);
+		remapped_mem->end = remapped_mem->start
+			+ AHCI_REMAP_N_SIZE - 1;
+		remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED;
+		pci_add_resource(resources, remapped_mem);
+	}
+
+	pcim_iounmap(nrdev->dev, mmio);
+
+	if (count == 0)
+		return -ENODEV;
+
+	nrdev->num_remapped_devices = count;
+	dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n",
+		 nrdev->num_remapped_devices);
+	return 0;
+}
+
+static void nvme_remap_remove_root_bus(void *data)
+{
+	struct pci_bus *bus = data;
+
+	pci_stop_root_bus(bus);
+	pci_remove_root_bus(bus);
+}
+
+static int nvme_remap_probe(struct pci_dev *dev,
+			    const struct pci_device_id *id)
+{
+	struct nvme_remap_dev *nrdev;
+	LIST_HEAD(resources);
+	int i;
+	int ret;
+	struct pci_dev *child;
+
+	nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL);
+	nrdev->sysdata.domain = find_free_domain();
+	nrdev->sysdata.nvme_remap_dev = dev;
+	nrdev->dev = dev;
+	pci_set_drvdata(dev, nrdev);
+
+	ret = pcim_enable_device(dev);
+	if (ret < 0)
+		return ret;
+
+	pci_set_master(dev);
+
+	ret = find_remapped_devices(nrdev, &resources);
+	if (ret)
+		return ret;
+
+	/* Add resources from the original AHCI device */
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		struct resource *res = &dev->resource[i];
+
+		if (res->start) {
+			struct resource *nr_res = &nrdev->ahci_resources[i];
+
+			nr_res->start = res->start;
+			nr_res->end = res->end;
+			nr_res->flags = res->flags;
+			pci_add_resource(&resources, nr_res);
+		}
+	}
+
+	/* Create virtual interrupts */
+	nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0,
+					       nrdev->num_remapped_devices + 1,
+					       0);
+	if (nrdev->irq_base < 0)
+		return nrdev->irq_base;
+
+	/* Create and populate PCI bus */
+	nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops,
+					 &nrdev->sysdata, &resources);
+	if (!nrdev->bus)
+		return -ENODEV;
+
+	if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus,
+				     nrdev->bus))
+		return -ENOMEM;
+
+	/* We don't support sharing MSI interrupts between these devices */
+	nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
+
+	pci_scan_child_bus(nrdev->bus);
+
+	list_for_each_entry(child, &nrdev->bus->devices, bus_list) {
+		/*
+		 * Prevent PCI core from trying to move memory BARs around.
+		 * The hidden NVMe devices are at fixed locations.
+		 */
+		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+			struct resource *res = &child->resource[i];
+
+			if (res->flags & IORESOURCE_MEM)
+				res->flags |= IORESOURCE_PCI_FIXED;
+		}
+
+		/* Share the legacy IRQ between all devices */
+		child->irq = dev->irq;
+	}
+
+	pci_assign_unassigned_bus_resources(nrdev->bus);
+	pci_bus_add_devices(nrdev->bus);
+
+	return 0;
+}
+
+static const struct pci_device_id nvme_remap_ids[] = {
+	/*
+	 * Match all Intel RAID controllers.
+	 *
+	 * There's overlap here with the set of devices detected by the ahci
+	 * driver, but ahci will only successfully probe when there
+	 * *aren't* any remapped NVMe devices, and this driver will only
+	 * successfully probe when there *are* remapped NVMe devices that
+	 * need handling.
+	 */
+	{
+		PCI_VDEVICE(INTEL, PCI_ANY_ID),
+		.class = PCI_CLASS_STORAGE_RAID << 8,
+		.class_mask = 0xffffff00,
+	},
+	{0,}
+};
+MODULE_DEVICE_TABLE(pci, nvme_remap_ids);
+
+static struct pci_driver nvme_remap_drv = {
+	.name		= MODULE_NAME,
+	.id_table	= nvme_remap_ids,
+	.probe		= nvme_remap_probe,
+};
+module_pci_driver(nvme_remap_drv);
+
+MODULE_AUTHOR("Daniel Drake <drake@endlessm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 0a1f668999ce..d4163fa9c27a 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3746,6 +3746,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
 	dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET;
 }
 
+static bool acs_on_downstream;
+static bool acs_on_multifunction;
+
+#define NUM_ACS_IDS 16
+struct acs_on_id {
+	unsigned short vendor;
+	unsigned short device;
+};
+static struct acs_on_id acs_on_ids[NUM_ACS_IDS];
+static u8 max_acs_id;
+
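+/*
+ * Parse the "pcie_acs_override=" kernel parameter.  Comma-separated options:
+ *   downstream      - override ACS for downstream and root ports
+ *   multifunction   - override ACS for multifunction devices
+ *   id:vvvv:dddd    - override ACS for a specific device, given as 16-bit
+ *                     hexadecimal vendor and device IDs
+ */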
+static __init int pcie_acs_override_setup(char *p)
+{
+	if (!p)
+		return -EINVAL;
+
+	while (*p) {
+		if (!strncmp(p, "downstream", 10))
+			acs_on_downstream = true;
+		if (!strncmp(p, "multifunction", 13))
+			acs_on_multifunction = true;
+		if (!strncmp(p, "id:", 3)) {
+			char opt[5];
+			int ret;
+			long val;
+
+			if (max_acs_id >= NUM_ACS_IDS - 1) {
+				pr_warn("Out of PCIe ACS override slots (%d)\n",
+						NUM_ACS_IDS);
+				goto next;
+			}
+
+			p += 3;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].vendor = val;
+
+			p += strcspn(p, ":");
+			if (*p != ':') {
+				pr_warn("PCIe ACS invalid ID\n");
+				goto next;
+			}
+
+			p++;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].device = val;
+			max_acs_id++;
+		}
+next:
+		p += strcspn(p, ",");
+		if (*p == ',')
+			p++;
+	}
+
+	if (acs_on_downstream || acs_on_multifunction || max_acs_id)
+		pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n");
+
+	return 0;
+}
+early_param("pcie_acs_override", pcie_acs_override_setup);
+
+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags)
+{
+	int i;
+
+	/* Never override ACS for legacy devices or devices with ACS caps */
+	if (!pci_is_pcie(dev) ||
+		pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS))
+			return -ENOTTY;
+
+	for (i = 0; i < max_acs_id; i++)
+		if (acs_on_ids[i].vendor == dev->vendor &&
+			acs_on_ids[i].device == dev->device)
+				return 1;
+
+	switch (pci_pcie_type(dev)) {
+	case PCI_EXP_TYPE_DOWNSTREAM:
+	case PCI_EXP_TYPE_ROOT_PORT:
+		if (acs_on_downstream)
+			return 1;
+		break;
+	case PCI_EXP_TYPE_ENDPOINT:
+	case PCI_EXP_TYPE_UPSTREAM:
+	case PCI_EXP_TYPE_LEG_END:
+	case PCI_EXP_TYPE_RC_END:
+		if (acs_on_multifunction && dev->multifunction)
+			return 1;
+	}
+
+	return -ENOTTY;
+}
 /*
  * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be
  * prevented for those affected devices.
@@ -5170,6 +5270,7 @@ static const struct pci_dev_acs_enabled {
 	{ PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs },
 	/* Wangxun nics */
 	{ PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs },
+	{ PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides },
 	{ 0 }
 };
 
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 37c24ffea65c..bd52d1e081b7 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1522,4 +1522,6 @@ endif # SCSI_LOWLEVEL
 
 source "drivers/scsi/device_handler/Kconfig"
 
+source "drivers/scsi/vhba/Kconfig"
+
 endmenu
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index 1313ddf2fd1a..5942e8f79159 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -153,6 +153,7 @@ obj-$(CONFIG_CHR_DEV_SCH)	+= ch.o
 obj-$(CONFIG_SCSI_ENCLOSURE)	+= ses.o
 
 obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/
+obj-$(CONFIG_VHBA)		+= vhba/
 
 # This goes last, so that "real" scsi devices probe earlier
 obj-$(CONFIG_SCSI_DEBUG)	+= scsi_debug.o
diff --git a/drivers/scsi/vhba/Kconfig b/drivers/scsi/vhba/Kconfig
new file mode 100644
index 000000000000..e70a381fe3df
--- /dev/null
+++ b/drivers/scsi/vhba/Kconfig
@@ -0,0 +1,9 @@
+config VHBA
+	tristate "Virtual (SCSI) Host Bus Adapter"
+	depends on SCSI
+	help
+	  This is the in-kernel part of CDEmu, a CD/DVD-ROM device
+	  emulator.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called vhba.
diff --git a/drivers/scsi/vhba/Makefile b/drivers/scsi/vhba/Makefile
new file mode 100644
index 000000000000..2d7524b66199
--- /dev/null
+++ b/drivers/scsi/vhba/Makefile
@@ -0,0 +1,4 @@
+VHBA_VERSION := 20240917
+
+obj-$(CONFIG_VHBA)		+= vhba.o
+ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror
diff --git a/drivers/scsi/vhba/vhba.c b/drivers/scsi/vhba/vhba.c
new file mode 100644
index 000000000000..7531223355e5
--- /dev/null
+++ b/drivers/scsi/vhba/vhba.c
@@ -0,0 +1,1130 @@
+/*
+ * vhba.c
+ *
+ * Copyright (C) 2007-2012 Chia-I Wu <olvaffe AT gmail DOT com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#define pr_fmt(fmt) "vhba: " fmt
+
+#include <linux/version.h>
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+#include <linux/sched/signal.h>
+#else
+#include <linux/sched.h>
+#endif
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <asm/uaccess.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_tcq.h>
+
+
+MODULE_AUTHOR("Chia-I Wu");
+MODULE_VERSION(VHBA_VERSION);
+MODULE_DESCRIPTION("Virtual SCSI HBA");
+MODULE_LICENSE("GPL");
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0)
+#define sdev_dbg(sdev, fmt, a...) \
+    dev_dbg(&(sdev)->sdev_gendev, fmt, ##a)
+#define scmd_dbg(scmd, fmt, a...) \
+    dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a)
+#endif
+
+#define VHBA_MAX_SECTORS_PER_IO 256
+#define VHBA_MAX_BUS 16
+#define VHBA_MAX_ID 16
+#define VHBA_MAX_DEVICES (VHBA_MAX_BUS * (VHBA_MAX_ID-1))
+#define VHBA_KBUF_SIZE PAGE_SIZE
+
+#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL)
+#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL)
+
+
+static int vhba_can_queue = 32;
+module_param_named(can_queue, vhba_can_queue, int, 0);
+
+
+enum vhba_req_state {
+    VHBA_REQ_FREE,
+    VHBA_REQ_PENDING,
+    VHBA_REQ_READING,
+    VHBA_REQ_SENT,
+    VHBA_REQ_WRITING,
+};
+
+struct vhba_command {
+    struct scsi_cmnd *cmd;
+    /* metatags are per-host. not to be confused with
+       queue tags that are usually per-lun */
+    unsigned long metatag;
+    int status;
+    struct list_head entry;
+};
+
+struct vhba_device {
+    unsigned int num;
+    spinlock_t cmd_lock;
+    struct list_head cmd_list;
+    wait_queue_head_t cmd_wq;
+    atomic_t refcnt;
+
+    unsigned char *kbuf;
+    size_t kbuf_size;
+};
+
+struct vhba_host {
+    struct Scsi_Host *shost;
+    spinlock_t cmd_lock;
+    int cmd_next;
+    struct vhba_command *commands;
+    spinlock_t dev_lock;
+    struct vhba_device *devices[VHBA_MAX_DEVICES];
+    int num_devices;
+    DECLARE_BITMAP(chgmap, VHBA_MAX_DEVICES);
+    int chgtype[VHBA_MAX_DEVICES];
+    struct work_struct scan_devices;
+};
+
+#define MAX_COMMAND_SIZE 16
+
+struct vhba_request {
+    __u32 metatag;
+    __u32 lun;
+    __u8 cdb[MAX_COMMAND_SIZE];
+    __u8 cdb_len;
+    __u32 data_len;
+};
+
+struct vhba_response {
+    __u32 metatag;
+    __u32 status;
+    __u32 data_len;
+};
+
+
+
+static struct vhba_command *vhba_alloc_command (void);
+static void vhba_free_command (struct vhba_command *vcmd);
+
+static struct platform_device vhba_platform_device;
+
+
+
+/* These functions define a symmetric 1:1 mapping between device numbers and
+   the bus and id. We have reserved the last id per bus for the host itself. */
+static void devnum_to_bus_and_id(unsigned int devnum, unsigned int *bus, unsigned int *id)
+{
+    *bus = devnum / (VHBA_MAX_ID-1);
+    *id  = devnum % (VHBA_MAX_ID-1);
+}
+
+static unsigned int bus_and_id_to_devnum(unsigned int bus, unsigned int id)
+{
+    return (bus * (VHBA_MAX_ID-1)) + id;
+}
+
+static struct vhba_device *vhba_device_alloc (void)
+{
+    struct vhba_device *vdev;
+
+    vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL);
+    if (!vdev) {
+        return NULL;
+    }
+
+    spin_lock_init(&vdev->cmd_lock);
+    INIT_LIST_HEAD(&vdev->cmd_list);
+    init_waitqueue_head(&vdev->cmd_wq);
+    atomic_set(&vdev->refcnt, 1);
+
+    vdev->kbuf = NULL;
+    vdev->kbuf_size = 0;
+
+    return vdev;
+}
+
+static void vhba_device_put (struct vhba_device *vdev)
+{
+    if (atomic_dec_and_test(&vdev->refcnt)) {
+        kfree(vdev);
+    }
+}
+
+static struct vhba_device *vhba_device_get (struct vhba_device *vdev)
+{
+    atomic_inc(&vdev->refcnt);
+
+    return vdev;
+}
+
+static int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd)
+{
+    struct vhba_host *vhost;
+    struct vhba_command *vcmd;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    vcmd = vhba_alloc_command();
+    if (!vcmd) {
+        return SCSI_MLQUEUE_HOST_BUSY;
+    }
+
+    vcmd->cmd = cmd;
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+    vcmd->metatag = scsi_cmd_to_rq(vcmd->cmd)->tag;
+#else
+    vcmd->metatag = vcmd->cmd->request->tag;
+#endif
+    list_add_tail(&vcmd->entry, &vdev->cmd_list);
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    wake_up_interruptible(&vdev->cmd_wq);
+
+    return 0;
+}
+
+static int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd)
+{
+    struct vhba_command *vcmd;
+    int retval;
+    unsigned long flags;
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        if (vcmd->cmd == cmd) {
+            list_del_init(&vcmd->entry);
+            break;
+        }
+    }
+
+    /* command not found */
+    if (&vcmd->entry == &vdev->cmd_list) {
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+        return SUCCESS;
+    }
+
+    while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) {
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+        scmd_dbg(cmd, "wait for I/O before aborting\n");
+        schedule_timeout(1);
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+    }
+
+    retval = (vcmd->status == VHBA_REQ_SENT) ? FAILED : SUCCESS;
+
+    vhba_free_command(vcmd);
+
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return retval;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)
+static int vhba_slave_alloc(struct scsi_device *sdev)
+{
+    struct Scsi_Host *shost = sdev->host;
+
+    sdev_dbg(sdev, "enabling tagging (queue depth: %i).\n", sdev->queue_depth);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)
+    if (!shost_use_blk_mq(shost) && shost->bqt) {
+#else
+    if (shost->bqt) {
+#endif
+        blk_queue_init_tags(sdev->request_queue, sdev->queue_depth, shost->bqt);
+    }
+    scsi_adjust_queue_depth(sdev, 0, sdev->queue_depth);
+
+    return 0;
+}
+#endif
+
+static void vhba_scan_devices_add (struct vhba_host *vhost, int bus, int id)
+{
+    struct scsi_device *sdev;
+
+    sdev = scsi_device_lookup(vhost->shost, bus, id, 0);
+    if (!sdev) {
+        scsi_add_device(vhost->shost, bus, id, 0);
+    } else {
+        dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device %d:%d:0!\n", bus, id);
+        scsi_device_put(sdev);
+    }
+}
+
+static void vhba_scan_devices_remove (struct vhba_host *vhost, int bus, int id)
+{
+    struct scsi_device *sdev;
+
+    sdev = scsi_device_lookup(vhost->shost, bus, id, 0);
+    if (sdev) {
+        scsi_remove_device(sdev);
+        scsi_device_put(sdev);
+    } else {
+        dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device %d:%d:0!\n", bus, id);
+    }
+}
+
+static void vhba_scan_devices (struct work_struct *work)
+{
+    struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices);
+    unsigned long flags;
+    int change, exists;
+    unsigned int devnum;
+    unsigned int bus, id;
+
+    for (;;) {
+        spin_lock_irqsave(&vhost->dev_lock, flags);
+
+        devnum = find_first_bit(vhost->chgmap, VHBA_MAX_DEVICES);
+        if (devnum >= VHBA_MAX_DEVICES) {
+            spin_unlock_irqrestore(&vhost->dev_lock, flags);
+            break;
+        }
+        change = vhost->chgtype[devnum];
+        exists = vhost->devices[devnum] != NULL;
+
+        vhost->chgtype[devnum] = 0;
+        clear_bit(devnum, vhost->chgmap);
+
+        spin_unlock_irqrestore(&vhost->dev_lock, flags);
+
+        devnum_to_bus_and_id(devnum, &bus, &id);
+
+        if (change < 0) {
+            dev_dbg(&vhost->shost->shost_gendev, "trying to remove target %d:%d:0\n", bus, id);
+            vhba_scan_devices_remove(vhost, bus, id);
+        } else if (change > 0) {
+            dev_dbg(&vhost->shost->shost_gendev, "trying to add target %d:%d:0\n", bus, id);
+            vhba_scan_devices_add(vhost, bus, id);
+        } else {
+            /* quick sequence of add/remove or remove/add; we determine
+               which one it was by checking if device structure exists */
+            if (exists) {
+                /* remove followed by add: remove and (re)add */
+                dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target %d:%d:0\n", bus, id);
+                vhba_scan_devices_remove(vhost, bus, id);
+                vhba_scan_devices_add(vhost, bus, id);
+            } else {
+                /* add followed by remove: no-op */
+                dev_dbg(&vhost->shost->shost_gendev, "no-op for target %d:%d:0\n", bus, id);
+            }
+        }
+    }
+}
+
+static int vhba_add_device (struct vhba_device *vdev)
+{
+    struct vhba_host *vhost;
+    unsigned int devnum;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    vhba_device_get(vdev);
+
+    spin_lock_irqsave(&vhost->dev_lock, flags);
+    if (vhost->num_devices >= VHBA_MAX_DEVICES) {
+        spin_unlock_irqrestore(&vhost->dev_lock, flags);
+        vhba_device_put(vdev);
+        return -EBUSY;
+    }
+
+    for (devnum = 0; devnum < VHBA_MAX_DEVICES; devnum++) {
+        if (vhost->devices[devnum] == NULL) {
+            vdev->num = devnum;
+            vhost->devices[devnum] = vdev;
+            vhost->num_devices++;
+            set_bit(devnum, vhost->chgmap);
+            vhost->chgtype[devnum]++;
+            break;
+        }
+    }
+    spin_unlock_irqrestore(&vhost->dev_lock, flags);
+
+    schedule_work(&vhost->scan_devices);
+
+    return 0;
+}
+
+static int vhba_remove_device (struct vhba_device *vdev)
+{
+    struct vhba_host *vhost;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    spin_lock_irqsave(&vhost->dev_lock, flags);
+    set_bit(vdev->num, vhost->chgmap);
+    vhost->chgtype[vdev->num]--;
+    vhost->devices[vdev->num] = NULL;
+    vhost->num_devices--;
+    spin_unlock_irqrestore(&vhost->dev_lock, flags);
+
+    vhba_device_put(vdev);
+
+    schedule_work(&vhost->scan_devices);
+
+    return 0;
+}
+
+static struct vhba_device *vhba_lookup_device (int devnum)
+{
+    struct vhba_host *vhost;
+    struct vhba_device *vdev = NULL;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    if (likely(devnum < VHBA_MAX_DEVICES)) {
+        spin_lock_irqsave(&vhost->dev_lock, flags);
+        vdev = vhost->devices[devnum];
+        if (vdev) {
+            vdev = vhba_device_get(vdev);
+        }
+
+        spin_unlock_irqrestore(&vhost->dev_lock, flags);
+    }
+
+    return vdev;
+}
+
+static struct vhba_command *vhba_alloc_command (void)
+{
+    struct vhba_host *vhost;
+    struct vhba_command *vcmd;
+    unsigned long flags;
+    int i;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    spin_lock_irqsave(&vhost->cmd_lock, flags);
+
+    vcmd = vhost->commands + vhost->cmd_next++;
+    if (vcmd->status != VHBA_REQ_FREE) {
+        for (i = 0; i < vhba_can_queue; i++) {
+            vcmd = vhost->commands + i;
+
+            if (vcmd->status == VHBA_REQ_FREE) {
+                vhost->cmd_next = i + 1;
+                break;
+            }
+        }
+
+        if (i == vhba_can_queue) {
+            vcmd = NULL;
+        }
+    }
+
+    if (vcmd) {
+        vcmd->status = VHBA_REQ_PENDING;
+    }
+
+    vhost->cmd_next %= vhba_can_queue;
+
+    spin_unlock_irqrestore(&vhost->cmd_lock, flags);
+
+    return vcmd;
+}
+
+static void vhba_free_command (struct vhba_command *vcmd)
+{
+    struct vhba_host *vhost;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    spin_lock_irqsave(&vhost->cmd_lock, flags);
+    vcmd->status = VHBA_REQ_FREE;
+    spin_unlock_irqrestore(&vhost->cmd_lock, flags);
+}
+
+static int vhba_queuecommand (struct Scsi_Host *shost, struct scsi_cmnd *cmd)
+{
+    struct vhba_device *vdev;
+    int retval;
+    unsigned int devnum;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+    scmd_dbg(cmd, "queue %p tag %i\n", cmd, scsi_cmd_to_rq(cmd)->tag);
+#else
+    scmd_dbg(cmd, "queue %p tag %i\n", cmd, cmd->request->tag);
+#endif
+
+    devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id);
+    vdev = vhba_lookup_device(devnum);
+    if (!vdev) {
+        scmd_dbg(cmd, "no such device\n");
+
+        cmd->result = DID_NO_CONNECT << 16;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0)
+        scsi_done(cmd);
+#else
+        cmd->scsi_done(cmd);
+#endif
+
+        return 0;
+    }
+
+    retval = vhba_device_queue(vdev, cmd);
+
+    vhba_device_put(vdev);
+
+    return retval;
+}
+
+static int vhba_abort (struct scsi_cmnd *cmd)
+{
+    struct vhba_device *vdev;
+    int retval = SUCCESS;
+    unsigned int devnum;
+
+    scmd_dbg(cmd, "abort %p\n", cmd);
+
+    devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id);
+    vdev = vhba_lookup_device(devnum);
+    if (vdev) {
+        retval = vhba_device_dequeue(vdev, cmd);
+        vhba_device_put(vdev);
+    } else {
+        cmd->result = DID_NO_CONNECT << 16;
+    }
+
+    return retval;
+}
+
+static struct scsi_host_template vhba_template = {
+    .module = THIS_MODULE,
+    .name = "vhba",
+    .proc_name = "vhba",
+    .queuecommand = vhba_queuecommand,
+    .eh_abort_handler = vhba_abort,
+    .this_id = -1,
+    .max_sectors = VHBA_MAX_SECTORS_PER_IO,
+    .sg_tablesize = 256,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)
+    .slave_alloc = vhba_slave_alloc,
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
+    .tag_alloc_policy = BLK_TAG_ALLOC_RR,
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+    .use_blk_tags = 1,
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0)
+    .max_segment_size = VHBA_KBUF_SIZE,
+#endif
+};
+
+static ssize_t do_request (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, char __user *buf, size_t buf_len)
+{
+    struct vhba_request vreq;
+    ssize_t ret;
+
+    scmd_dbg(cmd, "request %lu (%p), cdb 0x%x, bufflen %d, sg count %d\n",
+        metatag, cmd, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd));
+
+    ret = sizeof(vreq);
+    if (DATA_TO_DEVICE(cmd->sc_data_direction)) {
+        ret += scsi_bufflen(cmd);
+    }
+
+    if (ret > buf_len) {
+        scmd_dbg(cmd, "buffer too small (%zd < %zd) for a request\n", buf_len, ret);
+        return -EIO;
+    }
+
+    vreq.metatag = metatag;
+    vreq.lun = cmd->device->lun;
+    memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE);
+    vreq.cdb_len = cmd->cmd_len;
+    vreq.data_len = scsi_bufflen(cmd);
+
+    if (copy_to_user(buf, &vreq, sizeof(vreq))) {
+        return -EFAULT;
+    }
+
+    if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) {
+        buf += sizeof(vreq);
+
+        if (scsi_sg_count(cmd)) {
+            unsigned char *kaddr, *uaddr;
+            struct scatterlist *sglist = scsi_sglist(cmd);
+            struct scatterlist *sg;
+            int i;
+
+            uaddr = (unsigned char *) buf;
+
+            for_each_sg(sglist, sg, scsi_sg_count(cmd), i) {
+                size_t len = sg->length;
+
+                if (len > vdev->kbuf_size) {
+                    scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size);
+                    len = vdev->kbuf_size;
+                }
+
+                kaddr = kmap_atomic(sg_page(sg));
+                memcpy(vdev->kbuf, kaddr + sg->offset, len);
+                kunmap_atomic(kaddr);
+
+                if (copy_to_user(uaddr, vdev->kbuf, len)) {
+                    return -EFAULT;
+                }
+                uaddr += len;
+            }
+        } else {
+            if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) {
+                return -EFAULT;
+            }
+        }
+    }
+
+    return ret;
+}
+
+static ssize_t do_response (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res)
+{
+    ssize_t ret = 0;
+
+    scmd_dbg(cmd, "response %lu (%p), status %x, data len %d, sg count %d\n",
+         metatag, cmd, res->status, res->data_len, scsi_sg_count(cmd));
+
+    if (res->status) {
+        if (res->data_len > SCSI_SENSE_BUFFERSIZE) {
+            scmd_dbg(cmd, "truncate sense (%d < %d)", SCSI_SENSE_BUFFERSIZE, res->data_len);
+            res->data_len = SCSI_SENSE_BUFFERSIZE;
+        }
+
+        if (copy_from_user(cmd->sense_buffer, buf, res->data_len)) {
+            return -EFAULT;
+        }
+
+        cmd->result = res->status;
+
+        ret += res->data_len;
+    } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) {
+        size_t to_read;
+
+        if (res->data_len > scsi_bufflen(cmd)) {
+            scmd_dbg(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len);
+            res->data_len = scsi_bufflen(cmd);
+        }
+
+        to_read = res->data_len;
+
+        if (scsi_sg_count(cmd)) {
+            unsigned char *kaddr, *uaddr;
+            struct scatterlist *sglist = scsi_sglist(cmd);
+            struct scatterlist *sg;
+            int i;
+
+            uaddr = (unsigned char *)buf;
+
+            for_each_sg(sglist, sg, scsi_sg_count(cmd), i) {
+                size_t len = (sg->length < to_read) ? sg->length : to_read;
+
+                if (len > vdev->kbuf_size) {
+                    scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size);
+                    len = vdev->kbuf_size;
+                }
+
+                if (copy_from_user(vdev->kbuf, uaddr, len)) {
+                    return -EFAULT;
+                }
+                uaddr += len;
+
+                kaddr = kmap_atomic(sg_page(sg));
+                memcpy(kaddr + sg->offset, vdev->kbuf, len);
+                kunmap_atomic(kaddr);
+
+                to_read -= len;
+                if (to_read == 0) {
+                    break;
+                }
+            }
+        } else {
+            if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) {
+                return -EFAULT;
+            }
+
+            to_read -= res->data_len;
+        }
+
+        scsi_set_resid(cmd, to_read);
+
+        ret += res->data_len - to_read;
+    }
+
+    return ret;
+}
+
+static struct vhba_command *next_command (struct vhba_device *vdev)
+{
+    struct vhba_command *vcmd;
+
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        if (vcmd->status == VHBA_REQ_PENDING) {
+            break;
+        }
+    }
+
+    if (&vcmd->entry == &vdev->cmd_list) {
+        vcmd = NULL;
+    }
+
+    return vcmd;
+}
+
+static struct vhba_command *match_command (struct vhba_device *vdev, __u32 metatag)
+{
+    struct vhba_command *vcmd;
+
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        if (vcmd->metatag == metatag) {
+            break;
+        }
+    }
+
+    if (&vcmd->entry == &vdev->cmd_list) {
+        vcmd = NULL;
+    }
+
+    return vcmd;
+}
+
+static struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags)
+{
+    struct vhba_command *vcmd;
+    DEFINE_WAIT(wait);
+
+    while (!(vcmd = next_command(vdev))) {
+        if (signal_pending(current)) {
+            break;
+        }
+
+        prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE);
+
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+        schedule();
+
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+    }
+
+    finish_wait(&vdev->cmd_wq, &wait);
+    if (vcmd) {
+        vcmd->status = VHBA_REQ_READING;
+    }
+
+    return vcmd;
+}
+
+static ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset)
+{
+    struct vhba_device *vdev;
+    struct vhba_command *vcmd;
+    ssize_t ret;
+    unsigned long flags;
+
+    vdev = file->private_data;
+
+    /* Get next command */
+    if (file->f_flags & O_NONBLOCK) {
+        /* Non-blocking variant */
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+        vcmd = next_command(vdev);
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+        if (!vcmd) {
+            return -EWOULDBLOCK;
+        }
+    } else {
+        /* Blocking variant */
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+        vcmd = wait_command(vdev, flags);
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+        if (!vcmd) {
+            return -ERESTARTSYS;
+        }
+    }
+
+    ret = do_request(vdev, vcmd->metatag, vcmd->cmd, buf, buf_len);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    if (ret >= 0) {
+        vcmd->status = VHBA_REQ_SENT;
+        *offset += ret;
+    } else {
+        vcmd->status = VHBA_REQ_PENDING;
+    }
+
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return ret;
+}
+
+static ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset)
+{
+    struct vhba_device *vdev;
+    struct vhba_command *vcmd;
+    struct vhba_response res;
+    ssize_t ret;
+    unsigned long flags;
+
+    if (buf_len < sizeof(res)) {
+        return -EIO;
+    }
+
+    if (copy_from_user(&res, buf, sizeof(res))) {
+        return -EFAULT;
+    }
+
+    vdev = file->private_data;
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    vcmd = match_command(vdev, res.metatag);
+    if (!vcmd || vcmd->status != VHBA_REQ_SENT) {
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+        pr_debug("ctl dev #%u not expecting response\n", vdev->num);
+        return -EIO;
+    }
+    vcmd->status = VHBA_REQ_WRITING;
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    ret = do_response(vdev, vcmd->metatag, vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    if (ret >= 0) {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0)
+        scsi_done(vcmd->cmd);
+#else
+        vcmd->cmd->scsi_done(vcmd->cmd);
+#endif
+        ret += sizeof(res);
+
+        /* don't compete with vhba_device_dequeue */
+        if (!list_empty(&vcmd->entry)) {
+            list_del_init(&vcmd->entry);
+            vhba_free_command(vcmd);
+        }
+    } else {
+        vcmd->status = VHBA_REQ_SENT;
+    }
+
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return ret;
+}
+
+static long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg)
+{
+    struct vhba_device *vdev = file->private_data;
+    struct vhba_host *vhost = platform_get_drvdata(&vhba_platform_device);
+
+    switch (cmd) {
+        case 0xBEEF001: {
+            unsigned int ident[4]; /* host, channel, id, lun */
+
+            ident[0] = vhost->shost->host_no;
+            devnum_to_bus_and_id(vdev->num, &ident[1], &ident[2]);
+            ident[3] = 0; /* lun */
+
+            if (copy_to_user((void *) arg, ident, sizeof(ident))) {
+                return -EFAULT;
+            }
+
+            return 0;
+        }
+        case 0xBEEF002: {
+            unsigned int devnum = vdev->num;
+
+            if (copy_to_user((void *) arg, &devnum, sizeof(devnum))) {
+                return -EFAULT;
+            }
+
+            return 0;
+        }
+    }
+
+    return -ENOTTY;
+}
+
+#ifdef CONFIG_COMPAT
+static long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg)
+{
+    unsigned long compat_arg = (unsigned long)compat_ptr(arg);
+    return vhba_ctl_ioctl(file, cmd, compat_arg);
+}
+#endif
+
+static unsigned int vhba_ctl_poll (struct file *file, poll_table *wait)
+{
+    struct vhba_device *vdev = file->private_data;
+    unsigned int mask = 0;
+    unsigned long flags;
+
+    poll_wait(file, &vdev->cmd_wq, wait);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    if (next_command(vdev)) {
+        mask |= POLLIN | POLLRDNORM;
+    }
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return mask;
+}
+
+static int vhba_ctl_open (struct inode *inode, struct file *file)
+{
+    struct vhba_device *vdev;
+    int retval;
+
+    pr_debug("ctl dev open\n");
+
+    /* check if vhba is probed */
+    if (!platform_get_drvdata(&vhba_platform_device)) {
+        return -ENODEV;
+    }
+
+    vdev = vhba_device_alloc();
+    if (!vdev) {
+        return -ENOMEM;
+    }
+
+    vdev->kbuf_size = VHBA_KBUF_SIZE;
+    vdev->kbuf = kzalloc(vdev->kbuf_size, GFP_KERNEL);
+    if (!vdev->kbuf) {
+        /* drop the initial reference taken in vhba_device_alloc() */
+        vhba_device_put(vdev);
+        return -ENOMEM;
+    }
+
+    if (!(retval = vhba_add_device(vdev))) {
+        file->private_data = vdev;
+    }
+
+    vhba_device_put(vdev);
+
+    return retval;
+}
+
+static int vhba_ctl_release (struct inode *inode, struct file *file)
+{
+    struct vhba_device *vdev;
+    struct vhba_command *vcmd;
+    unsigned long flags;
+
+    vdev = file->private_data;
+
+    pr_debug("ctl dev release\n");
+
+    vhba_device_get(vdev);
+    vhba_remove_device(vdev);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING);
+
+        scmd_dbg(vcmd->cmd, "device released with command %lu (%p)\n", vcmd->metatag, vcmd->cmd);
+        vcmd->cmd->result = DID_NO_CONNECT << 16;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0)
+        scsi_done(vcmd->cmd);
+#else
+        vcmd->cmd->scsi_done(vcmd->cmd);
+#endif
+        vhba_free_command(vcmd);
+    }
+    INIT_LIST_HEAD(&vdev->cmd_list);
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    kfree(vdev->kbuf);
+    vdev->kbuf = NULL;
+
+    vhba_device_put(vdev);
+
+    return 0;
+}
+
+static struct file_operations vhba_ctl_fops = {
+    .owner = THIS_MODULE,
+    .open = vhba_ctl_open,
+    .release = vhba_ctl_release,
+    .read = vhba_ctl_read,
+    .write = vhba_ctl_write,
+    .poll = vhba_ctl_poll,
+    .unlocked_ioctl = vhba_ctl_ioctl,
+#ifdef CONFIG_COMPAT
+    .compat_ioctl = vhba_ctl_compat_ioctl,
+#endif
+};
+
+static struct miscdevice vhba_miscdev = {
+    .minor = MISC_DYNAMIC_MINOR,
+    .name = "vhba_ctl",
+    .fops = &vhba_ctl_fops,
+};
+
+static int vhba_probe (struct platform_device *pdev)
+{
+    struct Scsi_Host *shost;
+    struct vhba_host *vhost;
+    int i;
+
+    vhba_can_queue = clamp(vhba_can_queue, 1, 256);
+
+    shost = scsi_host_alloc(&vhba_template, sizeof(struct vhba_host));
+    if (!shost) {
+        return -ENOMEM;
+    }
+
+    shost->max_channel = VHBA_MAX_BUS-1;
+    shost->max_id = VHBA_MAX_ID;
+    /* we don't support lun > 0 */
+    shost->max_lun = 1;
+    shost->max_cmd_len = MAX_COMMAND_SIZE;
+    shost->can_queue = vhba_can_queue;
+    shost->cmd_per_lun = vhba_can_queue;
+
+    vhost = (struct vhba_host *)shost->hostdata;
+    memset(vhost, 0, sizeof(struct vhba_host));
+
+    vhost->shost = shost;
+    vhost->num_devices = 0;
+    spin_lock_init(&vhost->dev_lock);
+    spin_lock_init(&vhost->cmd_lock);
+    INIT_WORK(&vhost->scan_devices, vhba_scan_devices);
+    vhost->cmd_next = 0;
+    vhost->commands = kzalloc(vhba_can_queue * sizeof(struct vhba_command), GFP_KERNEL);
+    if (!vhost->commands) {
+        return -ENOMEM;
+    }
+
+    for (i = 0; i < vhba_can_queue; i++) {
+        vhost->commands[i].status = VHBA_REQ_FREE;
+    }
+
+    platform_set_drvdata(pdev, vhost);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
+    i = scsi_init_shared_tag_map(shost, vhba_can_queue);
+    if (i) return i;
+#endif
+
+    if (scsi_add_host(shost, &pdev->dev)) {
+        scsi_host_put(shost);
+        return -ENOMEM;
+    }
+
+    return 0;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0)
+static int vhba_remove (struct platform_device *pdev)
+#else
+static void vhba_remove (struct platform_device *pdev)
+#endif
+{
+    struct vhba_host *vhost;
+    struct Scsi_Host *shost;
+
+    vhost = platform_get_drvdata(pdev);
+    shost = vhost->shost;
+
+    scsi_remove_host(shost);
+    scsi_host_put(shost);
+
+    kfree(vhost->commands);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0)
+    return 0;
+#endif
+}
+
+static void vhba_release (struct device * dev)
+{
+    return;
+}
+
+static struct platform_device vhba_platform_device = {
+    .name = "vhba",
+    .id = -1,
+    .dev = {
+        .release = vhba_release,
+    },
+};
+
+static struct platform_driver vhba_platform_driver = {
+    .driver = {
+        .owner = THIS_MODULE,
+        .name = "vhba",
+    },
+    .probe = vhba_probe,
+    .remove = vhba_remove,
+};
+
+static int __init vhba_init (void)
+{
+    int ret;
+
+    ret = platform_device_register(&vhba_platform_device);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = platform_driver_register(&vhba_platform_driver);
+    if (ret < 0) {
+        platform_device_unregister(&vhba_platform_device);
+        return ret;
+    }
+
+    ret = misc_register(&vhba_miscdev);
+    if (ret < 0) {
+        platform_driver_unregister(&vhba_platform_driver);
+        platform_device_unregister(&vhba_platform_device);
+        return ret;
+    }
+
+    return 0;
+}
+
+static void __exit vhba_exit(void)
+{
+    misc_deregister(&vhba_miscdev);
+    platform_driver_unregister(&vhba_platform_driver);
+    platform_device_unregister(&vhba_platform_device);
+}
+
+module_init(vhba_init);
+module_exit(vhba_exit);
+
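To make the control-device protocol above concrete, here is a minimal userspace sketch. It is illustrative only and not part of this patch: it assumes the request/response structures are mirrored with the same layout as in vhba.c, and it completes every SCSI command with GOOD status and no data, which is just enough to exercise the read/write framing of vhba_ctl_read() and vhba_ctl_write(). The real consumer of /dev/vhba_ctl is the CDEmu daemon, which actually decodes the CDB and returns data and sense.

/* Hypothetical demo client for /dev/vhba_ctl -- not part of this patch. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Userspace mirrors of the kernel structures; layouts must match vhba.c. */
struct vhba_request {
    uint32_t metatag;
    uint32_t lun;
    uint8_t  cdb[16];
    uint8_t  cdb_len;
    uint32_t data_len;
};

struct vhba_response {
    uint32_t metatag;
    uint32_t status;
    uint32_t data_len;
};

/* Large enough for a request header plus the largest possible write
   payload (VHBA_MAX_SECTORS_PER_IO * 512 bytes). */
static unsigned char buf[sizeof(struct vhba_request) + 256 * 512];

int main(void)
{
    int fd = open("/dev/vhba_ctl", O_RDWR);

    if (fd < 0) {
        perror("open /dev/vhba_ctl");
        return 1;
    }

    for (;;) {
        struct vhba_request req;
        struct vhba_response res;

        /* A blocking read hands us the next queued SCSI command. */
        if (read(fd, buf, sizeof(buf)) < (ssize_t) sizeof(req))
            break;
        memcpy(&req, buf, sizeof(req));

        /* Complete it with GOOD status and no data transferred. */
        res.metatag  = req.metatag;
        res.status   = 0;
        res.data_len = 0;
        if (write(fd, &res, sizeof(res)) < 0)
            break;
    }

    close(fd);
    return 0;
}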
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b1c3db9cf355..e38685ece897 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,6 +206,14 @@ static inline void __mm_zero_struct_page(struct page *page)
 
 extern int sysctl_max_map_count;
 
+extern bool sysctl_workingset_protection;
+extern u8 sysctl_anon_min_ratio;
+extern u8 sysctl_clean_low_ratio;
+extern u8 sysctl_clean_min_ratio;
+int vm_workingset_protection_update_handler(
+	const struct ctl_table *table, int write,
+	void __user *buffer, size_t *lenp, loff_t *ppos);
+
 extern unsigned long sysctl_user_reserve_kbytes;
 extern unsigned long sysctl_admin_reserve_kbytes;
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bcf0865a38ae..e748afb0ce06 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1365,7 +1365,7 @@ struct readahead_control {
 		._index = i,						\
 	}
 
-#define VM_READAHEAD_PAGES	(SZ_128K / PAGE_SIZE)
+#define VM_READAHEAD_PAGES	(SZ_8M / PAGE_SIZE)
 
 void page_cache_ra_unbounded(struct readahead_control *,
 		unsigned long nr_to_read, unsigned long lookahead_count);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 7183e5aca282..56573371a2f8 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -159,6 +159,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
 
 #ifdef CONFIG_USER_NS
 
+extern int unprivileged_userns_clone;
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	if (ns)
@@ -192,6 +194,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns);
 struct ns_common *ns_get_owner(struct ns_common *ns);
 #else
 
+#define unprivileged_userns_clone 0
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	return &init_user_ns;
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 6d90ad974408..d04768b01364 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -163,6 +163,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
 
 extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 
@@ -1192,6 +1193,7 @@ do {										\
  */
 void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
diff --git a/init/Kconfig b/init/Kconfig
index a20e6efd3f0f..9437171030e2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -154,6 +154,10 @@ config THREAD_INFO_IN_TASK
 
 menu "General setup"
 
+config CACHY
+	bool "Some kernel tweaks by CachyOS"
+	default y
+
 config BROKEN
 	bool
 
@@ -1310,6 +1314,22 @@ config USER_NS
 
 	  If unsure, say N.
 
+config USER_NS_UNPRIVILEGED
+	bool "Allow unprivileged users to create namespaces"
+	default y
+	depends on USER_NS
+	help
+	  When disabled, unprivileged users will not be able to create
+	  new namespaces. Allowing users to create their own namespaces
+	  has been part of several recent local privilege escalation
+	  exploits, so if you need user namespaces but are
+	  paranoid^Wsecurity-conscious you want to disable this.
+
+	  This setting can be overridden at runtime via the
+	  kernel.unprivileged_userns_clone sysctl.
+
+	  If unsure, say Y.
+
 config PID_NS
 	bool "PID Namespaces"
 	default y
@@ -1452,6 +1472,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE
 	  with the "-O2" compiler flag for best performance and most
 	  helpful compile-time warnings.
 
+config CC_OPTIMIZE_FOR_PERFORMANCE_O3
+	bool "Optimize more for performance (-O3)"
+	help
+	  Choosing this option will pass "-O3" to your compiler to optimize
+	  the kernel yet more for performance.
+
 config CC_OPTIMIZE_FOR_SIZE
 	bool "Optimize for size (-Os)"
 	help
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 38ef6d06888e..0f78364efd4f 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -40,6 +40,27 @@ choice
 	 on SMP and NUMA systems and exactly dividing by both PAL and
 	 NTSC frame rates for video and multimedia work.
 
+	config HZ_500
+		bool "500 HZ"
+	help
+	 500 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption or sacrificing battery life on laptops.
+
+	config HZ_600
+		bool "600 HZ"
+	help
+	 600 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption or sacrificing battery life on laptops.
+
+	config HZ_750
+		bool "750 HZ"
+	help
+	 750 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption or sacrificing battery life on laptops.
+
 	config HZ_1000
 		bool "1000 HZ"
 	help
@@ -53,6 +74,9 @@ config HZ
 	default 100 if HZ_100
 	default 250 if HZ_250
 	default 300 if HZ_300
+	default 500 if HZ_500
+	default 600 if HZ_600
+	default 750 if HZ_750
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 54ea59ff8fbe..18f87e0dd137 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -88,7 +88,7 @@ endchoice
 
 config PREEMPT_RT
 	bool "Fully Preemptible Kernel (Real-Time)"
-	depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST
+	depends on ARCH_SUPPORTS_RT && !COMPILE_TEST
 	select PREEMPTION
 	help
 	  This option turns the kernel into a real-time kernel by replacing
diff --git a/kernel/fork.c b/kernel/fork.c
index 9da032802e34..32bf5e30ba4a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -106,6 +106,10 @@
 #include <linux/pidfs.h>
 #include <linux/tick.h>
 
+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif
+
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -2169,6 +2173,10 @@ __latent_entropy struct task_struct *copy_process(
 	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
 		return ERR_PTR(-EINVAL);
 
+	if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone)
+		if (!capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EPERM);
+
 	/*
 	 * Thread groups must share signals as well, and detached threads
 	 * can only be started up within the thread group.
@@ -3322,6 +3330,12 @@ int ksys_unshare(unsigned long unshare_flags)
 	if (unshare_flags & CLONE_NEWNS)
 		unshare_flags |= CLONE_FS;
 
+	if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) {
+		err = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto bad_unshare_out;
+	}
+
 	err = check_unshare_flags(unshare_flags);
 	if (err)
 		goto bad_unshare_out;
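The effect of the two gates added above is easiest to see from userspace. A minimal sketch (illustrative only): with kernel.unprivileged_userns_clone set to 0, an unprivileged process gets EPERM here; with the sysctl at 1 (the default when USER_NS_UNPRIVILEGED=y) the call succeeds as before.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
    if (unshare(CLONE_NEWUSER) == 0)
        puts("unprivileged user namespace created");
    else
        perror("unshare(CLONE_NEWUSER)");  /* EPERM when the sysctl is 0 */
    return 0;
}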
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2ddb827e3bea..464049c4af3f 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -747,6 +747,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
 	struct task_struct *new, *owner;
 	unsigned long flags, new_flags;
 	enum owner_state state;
+	int i = 0;
 
 	lockdep_assert_preemption_disabled();
 
@@ -783,7 +784,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
 			break;
 		}
 
-		cpu_relax();
+		if (i++ > 1000)
+			cpu_relax();
 	}
 
 	return state;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7d0a05660e5e..3a3116dca89c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
  *
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_base_slice			= 350000ULL;
+static unsigned int normalized_sysctl_sched_base_slice	= 350000ULL;
+#else
 unsigned int sysctl_sched_base_slice			= 750000ULL;
 static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;
+#endif
 
+#ifdef CONFIG_CACHY
+const_debug unsigned int sysctl_sched_migration_cost	= 300000UL;
+#else
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
+#endif
 
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
@@ -121,8 +130,12 @@ int __weak arch_asym_cpu_priority(int cpu)
  *
  * (default: 5 msec, units: microseconds)
  */
+#ifdef CONFIG_CACHY
+static unsigned int sysctl_sched_cfs_bandwidth_slice		= 3000UL;
+#else
 static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
 #endif
+#endif
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Restrict the NUMA promotion throughput (MB/s) for each target node. */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f3e121888d05..18f3955ddb8f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2820,7 +2820,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
 
 extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
 
-#ifdef CONFIG_PREEMPT_RT
+#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_CACHY)
 # define SCHED_NR_MIGRATE_BREAK 8
 #else
 # define SCHED_NR_MIGRATE_BREAK 32
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 51e38f5f4701..c5cc616484ba 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -47,6 +47,17 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
 }
 EXPORT_SYMBOL_GPL(add_wait_queue_priority);
 
+void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+	unsigned long flags;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&wq_head->lock, flags);
+	__add_wait_queue(wq_head, wq_entry);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_lifo);
+
 void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
@@ -258,6 +269,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
+void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
+{
+	unsigned long flags;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	spin_lock_irqsave(&wq_head->lock, flags);
+	if (list_empty(&wq_entry->entry))
+		__add_wait_queue(wq_head, wq_entry);
+	set_current_state(state);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo);
+
 void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
 {
 	wq_entry->flags = flags;
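Both _lifo helpers differ from their FIFO counterparts only in using __add_wait_queue(), which links the entry at the head of the list instead of the tail; since an exclusive wake-up stops after the first woken waiter, this turns the wake order for exclusive waiters from first-in-first-out into last-in-first-out.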
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5c9202cb8f59..de4ddf79fe97 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -80,6 +80,9 @@
 #ifdef CONFIG_RT_MUTEXES
 #include <linux/rtmutex.h>
 #endif
+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif
 
 /* shared constants to be used in various sysctls */
 const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
@@ -1617,6 +1620,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_USER_NS
+	{
+		.procname	= "unprivileged_userns_clone",
+		.data		= &unprivileged_userns_clone,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_PROC_SYSCTL
 	{
 		.procname	= "tainted",
@@ -2197,6 +2209,40 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 	},
 #endif
+	{
+		.procname	= "workingset_protection",
+		.data		= &sysctl_workingset_protection,
+		.maxlen		= sizeof(bool),
+		.mode		= 0644,
+		.proc_handler	= &proc_dobool,
+	},
+	{
+		.procname	= "anon_min_ratio",
+		.data		= &sysctl_anon_min_ratio,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= &vm_workingset_protection_update_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
+	{
+		.procname	= "clean_low_ratio",
+		.data		= &sysctl_clean_low_ratio,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= &vm_workingset_protection_update_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
+	{
+		.procname	= "clean_min_ratio",
+		.data		= &sysctl_clean_min_ratio,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= &vm_workingset_protection_update_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE_HUNDRED,
+	},
 	{
 		.procname	= "user_reserve_kbytes",
 		.data		= &sysctl_user_reserve_kbytes,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa0b2e47f2f2..d74d857b1696 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,6 +22,13 @@
 #include <linux/bsearch.h>
 #include <linux/sort.h>
 
+/* sysctl */
+#ifdef CONFIG_USER_NS_UNPRIVILEGED
+int unprivileged_userns_clone = 1;
+#else
+int unprivileged_userns_clone;
+#endif
+
 static struct kmem_cache *user_ns_cachep __ro_after_init;
 static DEFINE_MUTEX(userns_state_mutex);
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 84000b016808..1d96c5cc35d3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -499,6 +499,69 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
 config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
 	bool
 
+config ANON_MIN_RATIO
+	int "Default value for vm.anon_min_ratio"
+	depends on SYSCTL
+	range 0 100
+	default 15
+	help
+	  This option sets the default value for the vm.anon_min_ratio sysctl knob.
+
+	  The vm.anon_min_ratio sysctl knob provides *hard* protection of
+	  anonymous pages. The anonymous pages on the current node won't be
+	  reclaimed under any conditions when their amount is below
+	  vm.anon_min_ratio. This knob may be used to prevent excessive swap
+	  thrashing when anonymous memory is low (for example, when memory is
+	  about to be filled up by compressed data from the zram module).
+
+	  Setting this value too high (close to MemTotal) can result in
+	  an inability to swap and can lead to early OOM under memory pressure.
+
+config CLEAN_LOW_RATIO
+	int "Default value for vm.clean_low_ratio"
+	depends on SYSCTL
+	range 0 100
+	default 0
+	help
+	  This option sets the default value for the vm.clean_low_ratio sysctl knob.
+
+	  The vm.clean_low_ratio sysctl knob provides *best-effort*
+	  protection of clean file pages. The file pages on the current node
+	  won't be reclaimed under memory pressure when the amount of clean file
+	  pages is below vm.clean_low_ratio *unless* we threaten to OOM.
+	  Protection of clean file pages using this knob may be used when
+	  swapping is still possible to
+	    - prevent disk I/O thrashing under memory pressure;
+	    - improve performance in disk cache-bound tasks under memory
+	      pressure.
+
+	  Setting it to a high value may result in an early eviction of anonymous
+	  pages into the swap space by attempting to hold the protected amount
+	  of clean file pages in memory.
+
+config CLEAN_MIN_RATIO
+	int "Default value for vm.clean_min_ratio"
+	depends on SYSCTL
+	range 0 100
+	default 15
+	help
+	  This option sets the default value for the vm.clean_min_ratio sysctl knob.
+
+	  The vm.clean_min_ratio sysctl knob provides *hard* protection of
+	  clean file pages. The file pages on the current node won't be
+	  reclaimed under memory pressure when the amount of clean file pages is
+	  below vm.clean_min_ratio. Hard protection of clean file pages using
+	  this knob may be used to
+	    - prevent disk I/O thrashing under memory pressure even with no free
+	      swap space;
+	    - improve performance in disk cache-bound tasks under memory
+	      pressure;
+	    - avoid high latency and prevent livelock in near-OOM conditions.
+
+	  Setting it to a high value may result in an early out-of-memory condition
+	  due to the inability to reclaim the protected amount of clean file pages
+	  when other types of pages cannot be reclaimed.
+
 config HAVE_MEMBLOCK_PHYS_MAP
 	bool
 
@@ -648,7 +711,7 @@ config COMPACTION
 config COMPACT_UNEVICTABLE_DEFAULT
 	int
 	depends on COMPACTION
-	default 0 if PREEMPT_RT
+	default 0 if PREEMPT_RT || CACHY
 	default 1
 
 #
diff --git a/mm/compaction.c b/mm/compaction.c
index 384e4672998e..d122cbf8aa9e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1921,7 +1921,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE
  * aggressively the kernel should compact memory in the
  * background. It takes values in the range [0, 100].
  */
+#ifdef CONFIG_CACHY
+static unsigned int __read_mostly sysctl_compaction_proactiveness;
+#else
 static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+#endif
 static int sysctl_extfrag_threshold = 500;
 static int __read_mostly sysctl_compact_memory;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index db64116a4f84..3e0266c973e1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -64,7 +64,11 @@ unsigned long transparent_hugepage_flags __read_mostly =
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
 #endif
+#ifdef CONFIG_CACHY
+	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG)|
+#else
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
+#endif
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 24b68b425afb..081ddb92db87 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2630,6 +2630,7 @@ static void __init mem_init_print_info(void)
 		, K(totalhigh_pages())
 #endif
 		);
+	printk(KERN_INFO "le9 Unofficial (le9uo) working set protection 1.8 by Masahito Suzuki (forked from hakavlad's original le9 patch)\n");
 }
 
 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d9861e42b2bd..13ab2294f0bb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -71,7 +71,11 @@ static long ratelimit_pages = 32;
 /*
  * Start background writeback (via writeback threads) at this percentage
  */
+#ifdef CONFIG_CACHY
+static int dirty_background_ratio = 5;
+#else
 static int dirty_background_ratio = 10;
+#endif
 
 /*
  * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -99,7 +103,11 @@ static unsigned long vm_dirty_bytes;
 /*
  * The interval between `kupdate'-style writebacks
  */
+#ifdef CONFIG_CACHY
+unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */
+#else
 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
+#endif
 
 EXPORT_SYMBOL_GPL(dirty_writeback_interval);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 01eab25edf89..3ea393f1311a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -271,7 +271,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
+#ifdef CONFIG_CACHY
+static int watermark_boost_factor __read_mostly;
+#else
 static int watermark_boost_factor __read_mostly = 15000;
+#endif
 static int watermark_scale_factor = 10;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
diff --git a/mm/swap.c b/mm/swap.c
index 10decd9dffa1..98cc35d1600e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1049,6 +1049,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
  */
 void __init swap_setup(void)
 {
+#ifdef CONFIG_CACHY
+	/* Only swap-in pages requested, avoid readahead */
+	page_cluster = 0;
+#else
 	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
 
 	/* Use a smaller cluster for small-memory machines */
@@ -1060,4 +1064,5 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+#endif
 }
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index bd5183dfd879..3a410f53a07c 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
  * essence, they are percents: the higher the value, the more number
  * unsuccessful reclaims there were.
  */
+#ifdef CONFIG_CACHY
+static const unsigned int vmpressure_level_med = 65;
+#else
 static const unsigned int vmpressure_level_med = 60;
+#endif
 static const unsigned int vmpressure_level_critical = 95;
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c22175120f5d..9c6451e872af 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -148,6 +148,15 @@ struct scan_control {
 	/* The file folios on the current node are dangerously low */
 	unsigned int file_is_tiny:1;
 
+	/* The anonymous pages on the current node are below vm.anon_min_ratio */
+	unsigned int anon_below_min:1;
+
+	/* The clean file pages on the current node are below vm.clean_low_ratio */
+	unsigned int clean_below_low:1;
+
+	/* The clean file pages on the current node are below vm.clean_min_ratio */
+	unsigned int clean_below_min:1;
+
 	/* Always discard instead of demoting to lower tier memory */
 	unsigned int no_demotion:1;
 
@@ -197,10 +206,23 @@ struct scan_control {
 #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
 #endif
 
+bool sysctl_workingset_protection __read_mostly = true;
+u8 sysctl_anon_min_ratio  __read_mostly = CONFIG_ANON_MIN_RATIO;
+u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO;
+u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO;
+static u64 sysctl_anon_min_ratio_kb  __read_mostly = 0;
+static u64 sysctl_clean_low_ratio_kb __read_mostly = 0;
+static u64 sysctl_clean_min_ratio_kb __read_mostly = 0;
+static u64 workingset_protection_prev_totalram __read_mostly = 0;
+
 /*
  * From 0 .. MAX_SWAPPINESS.  Higher means more swappy.
  */
+#ifdef CONFIG_CACHY
+int vm_swappiness = 100;
+#else
 int vm_swappiness = 60;
+#endif
 
 #ifdef CONFIG_MEMCG
 
@@ -1097,6 +1119,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 		    folio_mapped(folio) && folio_test_referenced(folio))
 			goto keep_locked;
 
+		if (folio_is_file_lru(folio) ? sc->clean_below_min :
+				(sc->anon_below_min && !sc->clean_below_min))
+			goto keep_locked;
+
 		/*
 		 * The number of dirty pages determines if a node is marked
 		 * reclaim_congested. kswapd will stall and start writing
@@ -2424,6 +2450,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		goto out;
 	}
 
+	/*
+	 * Force-scan anon if clean file pages is under vm.clean_low_ratio
+	 * or vm.clean_min_ratio.
+	 */
+	if (sc->clean_below_low || sc->clean_below_min) {
+		scan_balance = SCAN_ANON;
+		goto out;
+	}
+
 	/*
 	 * If there is enough inactive page cache, we do not reclaim
 	 * anything from the anonymous working right now.
@@ -2568,6 +2603,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 			BUG();
 		}
 
+		/*
+		 * Hard protection of the working set.
+		 * Don't reclaim anon/file pages when the amount is
+		 * below the watermark of the same type.
+		 */
+		if (file ? sc->clean_below_min : sc->anon_below_min)
+			scan = 0;
+
 		nr[lru] = scan;
 	}
 }
@@ -2587,6 +2630,96 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
 	return can_demote(pgdat->node_id, sc);
 }
 
+int vm_workingset_protection_update_handler(const struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	workingset_protection_prev_totalram = 0;
+
+	return 0;
+}
+
+static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc)
+{
+	unsigned long node_mem_total;
+	struct sysinfo i;
+
+	if (!(sysctl_workingset_protection)) {
+		sc->anon_below_min = 0;
+		sc->clean_below_low = 0;
+		sc->clean_below_min = 0;
+		return;
+	}
+
+	if (likely(sysctl_anon_min_ratio  ||
+	           sysctl_clean_low_ratio ||
+		       sysctl_clean_min_ratio)) {
+#ifdef CONFIG_NUMA
+		si_meminfo_node(&i, pgdat->node_id);
+#else //CONFIG_NUMA
+		si_meminfo(&i);
+#endif //CONFIG_NUMA
+		node_mem_total = i.totalram;
+
+		if (unlikely(workingset_protection_prev_totalram != node_mem_total)) {
+			sysctl_anon_min_ratio_kb  =
+				node_mem_total * sysctl_anon_min_ratio  / 100;
+			sysctl_clean_low_ratio_kb =
+				node_mem_total * sysctl_clean_low_ratio / 100;
+			sysctl_clean_min_ratio_kb =
+				node_mem_total * sysctl_clean_min_ratio / 100;
+			workingset_protection_prev_totalram = node_mem_total;
+		}
+	}
+
+	/*
+	 * Check the number of anonymous pages to protect them from
+	 * reclaiming if their amount is below the specified.
+	 */
+	if (sysctl_anon_min_ratio) {
+		unsigned long reclaimable_anon;
+
+		reclaimable_anon =
+			node_page_state(pgdat, NR_ACTIVE_ANON) +
+			node_page_state(pgdat, NR_INACTIVE_ANON) +
+			node_page_state(pgdat, NR_ISOLATED_ANON);
+
+		sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb;
+	} else
+		sc->anon_below_min = 0;
+
+	/*
+	 * Check the number of clean file pages to protect them from
+	 * reclaiming if their amount is below the specified.
+	 */
+	if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) {
+		unsigned long reclaimable_file, dirty, clean;
+
+		reclaimable_file =
+			node_page_state(pgdat, NR_ACTIVE_FILE) +
+			node_page_state(pgdat, NR_INACTIVE_FILE) +
+			node_page_state(pgdat, NR_ISOLATED_FILE);
+		dirty = node_page_state(pgdat, NR_FILE_DIRTY);
+		/*
+		 * node_page_state() sum can go out of sync since
+		 * all the values are not read at once.
+		 */
+		if (likely(reclaimable_file > dirty))
+			clean = reclaimable_file - dirty;
+		else
+			clean = 0;
+
+		sc->clean_below_low = clean < sysctl_clean_low_ratio_kb;
+		sc->clean_below_min = clean < sysctl_clean_min_ratio_kb;
+	} else {
+		sc->clean_below_low = 0;
+		sc->clean_below_min = 0;
+	}
+}
+
 #ifdef CONFIG_LRU_GEN
 
 #ifdef CONFIG_LRU_GEN_ENABLED
@@ -4540,6 +4673,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
 	 */
 	if (!swappiness)
 		type = LRU_GEN_FILE;
+	else if (sc->clean_below_min)
+		type = LRU_GEN_ANON;
+	else if (sc->anon_below_min)
+		type = LRU_GEN_FILE;
+	else if (sc->clean_below_low)
+		type = LRU_GEN_ANON;
 	else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
 		type = LRU_GEN_ANON;
 	else if (swappiness == 1)
@@ -4830,6 +4969,8 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
+	prepare_workingset_protection(pgdat, sc);
+
 	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
 	if (mem_cgroup_below_min(NULL, memcg))
 		return MEMCG_LRU_YOUNG;
@@ -5978,6 +6119,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 
 	prepare_scan_control(pgdat, sc);
 
+	prepare_workingset_protection(pgdat, sc);
+
 	shrink_node_memcgs(pgdat, sc);
 
 	flush_reclaim_state(sc);
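To put the defaults in perspective (illustrative numbers, not taken from this patch): on a node with 16 GiB of memory, i.e. 4,194,304 4-KiB pages, prepare_workingset_protection() turns anon_min_ratio=15 and clean_min_ratio=15 into cutoffs of 4,194,304 * 15 / 100 = 629,145 pages, roughly 2.4 GiB each; anonymous pages are not reclaimed below the first threshold and clean file pages not below the second, while the default clean_low_ratio=0 leaves the soft file-page threshold disabled. Note that despite the _kb suffix, the cached threshold variables hold page counts: si_meminfo()/si_meminfo_node() report totalram in pages, which is the unit they are compared against via node_page_state().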
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6872b5aff73e..1910fe1b2471 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -634,7 +634,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 	 * having to remove and re-insert us on the wait queue.
 	 */
 	for (;;) {
-		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+		prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait,
 					  TASK_INTERRUPTIBLE);
 		release_sock(sk);
 		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
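The switch to the LIFO variant changes which acceptor is woken when a connection arrives: instead of the task that has waited longest, the one that blocked most recently runs first. The intent is that a recently active thread is more likely to still be cache-warm, while surplus threads in an accept() pool stay parked rather than being rotated through.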
-- 
2.48.0.rc1

