SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome

2 years ago · 84a5859541
1 changed files with 90 additions and 44 deletions
--- a/client/wasm/lod.c
+++ b/client/wasm/lod.c
@ -1,4 +1,4 @@
				@@ -1,4 +1,4 @@
-// clang -g -Wall -Wextra -O3 -Wl,--export-all,--no-entry --target=wasm32 -Xclang -target-feature -Xclang +simd128 lod.c -nostdlib -o lod.wasm
+// clang -Oz --target=wasm32 -nostdlib -Wl,--export-all,--no-entry -msimd128 lod.c -o lod.wasm

 #include <wasm_simd128.h>

@ -39,16 +39,20 @@ static int
				@@ -39,16 +39,20 @@ static int
 rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coords_from, 
             int segment_start, int segment_end)
 {
-    float EPS = 0.125f / zoom * 255.0f;
-    
    int result = -1;
+
+    if (segment_start == segment_end) {
+        return(result);
+    }
+
+    float EPS = 0.125f / zoom * 255.0f;
    float max_dist = 0.0f;
-    
+
    float ax = xs[coords_from + segment_start];
    float ay = ys[coords_from + segment_start];
    float bx = xs[coords_from + segment_end];
    float by = ys[coords_from + segment_end];
-   
+
    unsigned char ap = pressures[coords_from / 2 + segment_start];
    unsigned char bp = pressures[coords_from / 2 + segment_end];

@ -58,75 +62,117 @@ rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coo
				@@ -58,75 +62,117 @@ rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coo
    float dist_ab = __builtin_sqrtf(dx * dx + dy * dy);
    float dir_nx = dy / dist_ab * 255.0f;
    float dir_ny = -dx / dist_ab * 255.0f;
-
-#if 0
-    v128_t scale_255 = wasm_f32x4_splat(1.0f / 255.0f);
-    v128_t EPSs = wasm_f32x4_splat(EPS);
-#endif
-    
+#if 1
    for (int i = segment_start + 1; i < segment_end; ++i) {
-#if 0
-        v128_t pxs = wasm_v128_load(coordinates_x + coords_from + i);
-        v128_t pxs = wasm_v128_load(coordinates_y + coords_from + i);
+        float px = xs[coords_from + i];
+        float py = ys[coords_from + i];
+        
+        unsigned char pp = pressures[coords_from + i];
+
+        float apx = px - ax;
+        float apy = py - ay;
+        
+        float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)
+            + __builtin_abs(pp - ap) + __builtin_abs(pp - bp);
+        
+        if (dist > EPS && dist > max_dist) {
+            result = i;
+            max_dist = dist;
+        }
+    }
+#else
+    v128_t eps_x4 = wasm_f32x4_splat(EPS);
+    v128_t ax_x4 = wasm_f32x4_splat(ax);
+    v128_t ay_x4 = wasm_f32x4_splat(ay);
+    v128_t ap_x4 = wasm_f32x4_splat(ap);
+    v128_t bp_x4 = wasm_f32x4_splat(bp);
+    v128_t dir_nx_x4 = wasm_f32x4_splat(dir_nx);
+    v128_t dir_ny_x4 = wasm_f32x4_splat(dir_ny);
+
+    v128_t index_x4 = wasm_u32x4_make(segment_start + 1, segment_start + 2, segment_start + 3, segment_start + 4);
+    v128_t four_x4 = wasm_u32x4_const_splat(4);
+    v128_t max_vals_x4 = wasm_f32x4_const_splat(0.0f);
+    v128_t max_index_x4 = wasm_u32x4_const_splat(-1);
+
+    for (int i = segment_start + 1; i < segment_end - 3; i += 4) {
+        v128_t px_x4 = wasm_v128_load(xs + coords_from + i);
+        v128_t py_x4 = wasm_v128_load(ys + coords_from + i);
+        v128_t pp_x16 = wasm_v128_load(pressures + coords_from / 2 + i);

-        v128_t pps = wasm_v128_load(pressures + coords_from + i);
+        // Take 4 highest bytes and convert to float
+        v128_t pp_x8 = wasm_u16x8_extend_high_u8x16(pp_x16);
+        v128_t pp_x4i = wasm_u32x4_extend_high_u16x8(pp_x8);
+        v128_t pp_x4 = wasm_f32x4_convert_i32x4(pp_x4i); // i version is 8 times faster on x64?

-        v128_t apxs = wasm_f32x4_sub(pxs, axs);
-        v128_t apys = wasm_f32x4_sub(pys, ays);
+        v128_t apx_x4 = wasm_f32x4_sub(px_x4, ax_x4);
+        v128_t apy_x4 = wasm_f32x4_sub(py_x4, ay_x4);

-        v128_t dists = wasm_f32x4_add(
+        v128_t dist_x4 = wasm_f32x4_add(
            wasm_f32x4_add(
-                wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, aps)), scale_255),
-                wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, bps)), scale_255)
+                wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, ap_x4)),
+                wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, bp_x4))
            ),
            wasm_f32x4_abs(
                wasm_f32x4_add(
-                    wasm_f32x4_mul(apxs, dir_nxs),
-                    wasm_f32x4_mul(apys, dir_nys)
+                    wasm_f32x4_mul(apx_x4, dir_nx_x4),
+                    wasm_f32x4_mul(apy_x4, dir_ny_x4)
                )
            )
        );
+        
+        v128_t dist_mask = wasm_f32x4_gt(dist_x4, eps_x4);
+        v128_t max_mask = wasm_f32x4_gt(dist_x4, max_vals_x4);
+        v128_t mask = wasm_v128_and(dist_mask, max_mask);

-        v128_t dist_mask = wasm_f32x4_gt(dists, EPSs);
-        v128_t max_mask = wasm_f32x4_gt(dists, max_dists);
-        v128_t final_mask = wasm_v128_and(dist_mask, max_mask);
-
-        if (!wasm_v128_any_true(final_mask)) {
-            // fast path? hopefully?
-            continue;
-        }
-
-        // Places max(0, 2) and max(1, 3) into lanes (0, 1)
-        v128_t max_02_13 = wasm_f32x4_max(
-            dists,  
-            wasm_i32x4_shuffle(dists, dists, 2, 3, 2, 3)
+        max_index_x4 = wasm_v128_or(
+            wasm_v128_and(index_x4, mask),
+            wasm_v128_andnot(max_index_x4, mask)
        );

-        // Places max(max(0, 2), max(1, 3)) into lane 0
-        v128_t max_0123 = wasm_f32x4_max(
-            max_02_13,
-            wasm_i32x4_shuffle(max_02_13, max_02_13, 1, 1, 1, 1)
+        max_vals_x4 = wasm_v128_or(
+            wasm_v128_and(dist_x4, mask),
+            wasm_v128_andnot(max_vals_x4, mask)
        );
+        
+        index_x4 = wasm_i32x4_add(index_x4, four_x4);
+    }

-        float final_max = wasm_f32x4_extract_lane(max_0123, 0);
-#endif
+    int indices[4];
+    float values[4];
+
+    wasm_v128_store(indices, max_index_x4);
+    wasm_v128_store(values, max_vals_x4);

+    for (int i = 0; i < 4; ++i) {
+        if (indices[i] != -1) {
+            if (values[i] > max_dist) {
+                result = indices[i];
+                max_dist = values[i];
+            }
+        }
+    }
+
+    int remainder = (segment_end - segment_start - 1) % 4;
+
+    for (int i = segment_end - remainder; i < segment_end; ++i) {
        float px = xs[coords_from + i];
        float py = ys[coords_from + i];
-        
+
        unsigned char pp = pressures[coords_from + i];

        float apx = px - ax;
        float apy = py - ay;
-        
+
        float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)
            + __builtin_abs(pp - ap) + __builtin_abs(pp - bp);
-        
+
        if (dist > EPS && dist > max_dist) {
            result = i;
            max_dist = dist;
        }
    }
+
+#endif
    
    return(result);
 }