shithub: dav1d

Download patch

ref: 71f27407dd4d16f1ce2ce2ea9071a1b0816c2c27
parent: 42af404efe19f04e58314b20f82df5277b60ff01
author: Victorien Le Couviour--Tuffet <[email protected]>
date: Fri Apr 3 08:35:54 EDT 2020

x86: add some explanatory comment to wiener_filter_h

Explains how the clipping to the range defined in the spec works.

--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -169,14 +169,21 @@
     paddw         m2, m4
     paddw         m0, m6
     paddw         m2, m5
-    paddsw        m0, m8
+    ; for a signed overflow to happen we need filter and pixels as follow:
+    ; filter => -5,-23,-17,90,-17,-23,-5
+    ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0
+    ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6]
+    ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84]
+    ;  32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A]
+    ; => signed 16-bit overflow occurs
+    paddsw        m0, m8  ; paddsw clips this range to [-8000;+7FFF]
     paddsw        m2, m3
-    psraw         m0, 3
+    psraw         m0, 3   ; shift changes the range to [-1000;+FFF]
     psraw         m2, 3
-    paddw         m0, m11
-    paddw         m2, m11
-    mova   [dstptrq], xm0
-    mova [dstptrq+16], xm2
+    paddw         m0, m11 ; adding back 800 (removed in m8) changes the
+    paddw         m2, m11 ; range to [-800;+17FF] as defined in the spec
+    mova   [dstptrq], xm0 ; (note that adding another 800 would give us
+    mova [dstptrq+16], xm2;  the same range as in the C code => [0;1FFF])
     vextracti128 [dstptrq+32], m0, 1
     vextracti128 [dstptrq+48], m2, 1
     vextracti128 xm0, m1, 1
--- a/src/x86/looprestoration_ssse3.asm
+++ b/src/x86/looprestoration_ssse3.asm
@@ -359,8 +359,8 @@
     paddw         m2, m4
     paddw         m0, m3
     paddw         m2, m5
-    paddsw        m0, m8
-    paddsw        m2, m6
+    paddsw        m0, m8 ; see the avx2 for an explanation
+    paddsw        m2, m6 ; of how the clipping works here
     psraw         m0, 3
     psraw         m2, 3
     paddw         m0, m11