diff --git a/client/wasm/lod.c b/client/wasm/lod.c index 2c3420d..28421fc 100644 --- a/client/wasm/lod.c +++ b/client/wasm/lod.c @@ -1,4 +1,4 @@ -// clang -g -Wall -Wextra -O3 -Wl,--export-all,--no-entry --target=wasm32 -Xclang -target-feature -Xclang +simd128 lod.c -nostdlib -o lod.wasm +// clang -Oz --target=wasm32 -nostdlib -Wl,--export-all,--no-entry -msimd128 lod.c -o lod.wasm #include @@ -39,16 +39,20 @@ static int rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coords_from, int segment_start, int segment_end) { - float EPS = 0.125f / zoom * 255.0f; - int result = -1; + + if (segment_start == segment_end) { + return(result); + } + + float EPS = 0.125f / zoom * 255.0f; float max_dist = 0.0f; - + float ax = xs[coords_from + segment_start]; float ay = ys[coords_from + segment_start]; float bx = xs[coords_from + segment_end]; float by = ys[coords_from + segment_end]; - + unsigned char ap = pressures[coords_from / 2 + segment_start]; unsigned char bp = pressures[coords_from / 2 + segment_end]; @@ -58,75 +62,117 @@ rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coo float dist_ab = __builtin_sqrtf(dx * dx + dy * dy); float dir_nx = dy / dist_ab * 255.0f; float dir_ny = -dx / dist_ab * 255.0f; - -#if 0 - v128_t scale_255 = wasm_f32x4_splat(1.0f / 255.0f); - v128_t EPSs = wasm_f32x4_splat(EPS); -#endif - +#if 1 for (int i = segment_start + 1; i < segment_end; ++i) { -#if 0 - v128_t pxs = wasm_v128_load(coordinates_x + coords_from + i); - v128_t pxs = wasm_v128_load(coordinates_y + coords_from + i); + float px = xs[coords_from + i]; + float py = ys[coords_from + i]; + + unsigned char pp = pressures[coords_from + i]; + + float apx = px - ax; + float apy = py - ay; + + float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny) + + __builtin_abs(pp - ap) + __builtin_abs(pp - bp); + + if (dist > EPS && dist > max_dist) { + result = i; + max_dist = dist; + } + } +#else + v128_t eps_x4 = wasm_f32x4_splat(EPS); + v128_t ax_x4 = wasm_f32x4_splat(ax); + v128_t ay_x4 = wasm_f32x4_splat(ay); + v128_t ap_x4 = wasm_f32x4_splat(ap); + v128_t bp_x4 = wasm_f32x4_splat(bp); + v128_t dir_nx_x4 = wasm_f32x4_splat(dir_nx); + v128_t dir_ny_x4 = wasm_f32x4_splat(dir_ny); + + v128_t index_x4 = wasm_u32x4_make(segment_start + 1, segment_start + 2, segment_start + 3, segment_start + 4); + v128_t four_x4 = wasm_u32x4_const_splat(4); + v128_t max_vals_x4 = wasm_f32x4_const_splat(0.0f); + v128_t max_index_x4 = wasm_u32x4_const_splat(-1); + + for (int i = segment_start + 1; i < segment_end - 3; i += 4) { + v128_t px_x4 = wasm_v128_load(xs + coords_from + i); + v128_t py_x4 = wasm_v128_load(ys + coords_from + i); + v128_t pp_x16 = wasm_v128_load(pressures + coords_from / 2 + i); - v128_t pps = wasm_v128_load(pressures + coords_from + i); + // Take 4 highest bytes and convert to float + v128_t pp_x8 = wasm_u16x8_extend_high_u8x16(pp_x16); + v128_t pp_x4i = wasm_u32x4_extend_high_u16x8(pp_x8); + v128_t pp_x4 = wasm_f32x4_convert_i32x4(pp_x4i); // i version is 8 times faster on x64? - v128_t apxs = wasm_f32x4_sub(pxs, axs); - v128_t apys = wasm_f32x4_sub(pys, ays); + v128_t apx_x4 = wasm_f32x4_sub(px_x4, ax_x4); + v128_t apy_x4 = wasm_f32x4_sub(py_x4, ay_x4); - v128_t dists = wasm_f32x4_add( + v128_t dist_x4 = wasm_f32x4_add( wasm_f32x4_add( - wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, aps)), scale_255), - wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, bps)), scale_255) + wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, ap_x4)), + wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, bp_x4)) ), wasm_f32x4_abs( wasm_f32x4_add( - wasm_f32x4_mul(apxs, dir_nxs), - wasm_f32x4_mul(apys, dir_nys) + wasm_f32x4_mul(apx_x4, dir_nx_x4), + wasm_f32x4_mul(apy_x4, dir_ny_x4) ) ) ); + + v128_t dist_mask = wasm_f32x4_gt(dist_x4, eps_x4); + v128_t max_mask = wasm_f32x4_gt(dist_x4, max_vals_x4); + v128_t mask = wasm_v128_and(dist_mask, max_mask); - v128_t dist_mask = wasm_f32x4_gt(dists, EPSs); - v128_t max_mask = wasm_f32x4_gt(dists, max_dists); - v128_t final_mask = wasm_v128_and(dist_mask, max_mask); - - if (!wasm_v128_any_true(final_mask)) { - // fast path? hopefully? - continue; - } - - // Places max(0, 2) and max(1, 3) into lanes (0, 1) - v128_t max_02_13 = wasm_f32x4_max( - dists, - wasm_i32x4_shuffle(dists, dists, 2, 3, 2, 3) + max_index_x4 = wasm_v128_or( + wasm_v128_and(index_x4, mask), + wasm_v128_andnot(max_index_x4, mask) ); - // Places max(max(0, 2), max(1, 3)) into lane 0 - v128_t max_0123 = wasm_f32x4_max( - max_02_13, - wasm_i32x4_shuffle(max_02_13, max_02_13, 1, 1, 1, 1) + max_vals_x4 = wasm_v128_or( + wasm_v128_and(dist_x4, mask), + wasm_v128_andnot(max_vals_x4, mask) ); + + index_x4 = wasm_i32x4_add(index_x4, four_x4); + } - float final_max = wasm_f32x4_extract_lane(max_0123, 0); -#endif + int indices[4]; + float values[4]; + + wasm_v128_store(indices, max_index_x4); + wasm_v128_store(values, max_vals_x4); + for (int i = 0; i < 4; ++i) { + if (indices[i] != -1) { + if (values[i] > max_dist) { + result = indices[i]; + max_dist = values[i]; + } + } + } + + int remainder = (segment_end - segment_start - 1) % 4; + + for (int i = segment_end - remainder; i < segment_end; ++i) { float px = xs[coords_from + i]; float py = ys[coords_from + i]; - + unsigned char pp = pressures[coords_from + i]; float apx = px - ax; float apy = py - ay; - + float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny) + __builtin_abs(pp - ap) + __builtin_abs(pp - bp); - + if (dist > EPS && dist > max_dist) { result = i; max_dist = dist; } } + +#endif return(result); }