Fix __m128/__m128i type mismatch in tint0r SSE4.1 path (GCC 16).

tint_sse41() reuses the same __m128 variables to hold both float vectors
and the integer results of _mm_cvtps_epi32 / _mm_loadu_si128 /
_mm_packus_epi16, and then passes them back to integer intrinsics.
__m128 and __m128i are distinct types; GCC 16 rejects the implicit
conversions between them (older/looser compilers silently reinterpreted
the bits).
Split the integer-typed values into their own __m128i variables (p for
the packed pixel bytes, q0..q3 for the _mm_cvtps_epi32 results); the
actual float<->int conversions are still done by the _mm_cvtepi32_ps /
_mm_cvtps_epi32 intrinsics, so behaviour is unchanged. This path is only
compiled when __SSE4_1__ is defined (e.g. with -march=znver5). Not fixed
upstream as of v3.1.3.

--- a/src/filter/tint0r/tint0r.c	2026-06-07 00:10:20.271020425 -0300
+++ b/src/filter/tint0r/tint0r.c	2026-06-07 00:10:20.275223565 -0300
@@ -186,7 +186,11 @@
   tmp0 = _mm_mul_ps(cdelta, sse_amount),
   tmp1 = _mm_mul_ps(_mm_mul_ps(sse_amount, _mm_set1_ps(255.0)), cmin);
 
-  __m128 p, p0, p1, p2, p3, luma;
+  /* p and the q* hold integer pixel data; p0..p3/luma are float. GCC 16
+     rejects mixing __m128 and __m128i, so keep them as distinct types. */
+  __m128i p;
+  __m128 p0, p1, p2, p3, luma;
+  __m128i q0, q1, q2, q3;
 
   // Process pixels in groups of 4
   for (size_t i = 0; i < len; i++)
@@ -200,17 +204,17 @@
     p2 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(p, 8)));
     p3 = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(p, 12)));
 
-    #define tint(v) \
+    #define tint(v, q) \
       luma = _mm_dp_ps((v), weights, 0x7F); \
-      v = _mm_add_ps(_mm_mul_ps(comp_amount, (v)), \
+      (v) = _mm_add_ps(_mm_mul_ps(comp_amount, (v)), \
                      _mm_add_ps(_mm_mul_ps(luma, tmp0), tmp1)); \
-      v = _mm_cvtps_epi32(v)
+      (q) = _mm_cvtps_epi32(v)
 
-    tint(p0); tint(p1); tint(p2); tint(p3);
+    tint(p0, q0); tint(p1, q1); tint(p2, q2); tint(p3, q3);
 
     /* Gather the processed pixels */
-    p = _mm_packus_epi16(_mm_packus_epi32(p0, p1),
-                         _mm_packus_epi32(p2, p3));
+    p = _mm_packus_epi16(_mm_packus_epi32(q0, q1),
+                         _mm_packus_epi32(q2, q3));
 
     _mm_storeu_si128((__m128i*)(outframe + i * 4), p);
   }
