diff --git a/client/client_recv.js b/client/client_recv.js index d74ae9d..24ae3f6 100644 --- a/client/client_recv.js +++ b/client/client_recv.js @@ -244,17 +244,23 @@ function handle_event(state, context, event, options = {}) { wasm_ensure_by(state, 1, event.coords.length); - const coordinates = state.wasm.buffers['coordinates']; const pressures = state.wasm.buffers['pressures']; + const xs = state.wasm.buffers['xs']; + const ys = state.wasm.buffers['ys']; + + event.coords_from = xs.tv.size; + event.coords_to = xs.tv.size + point_count; - event.coords_from = coordinates.tv.size; - event.coords_to = coordinates.tv.size + point_count * 2; - - tv_add(state.wasm.buffers['coords_from'].tv, coordinates.tv.size + point_count * 2); + tv_add(state.wasm.buffers['coords_from'].tv, xs.tv.size + point_count); state.wasm.buffers['coords_from'].used += 4; // 4 bytes, not 4 ints - tv_append(coordinates.tv, event.coords); - state.wasm.buffers['coordinates'].used += point_count * 2 * 4; + for (let i = 0; i < event.coords.length; i += 2) { + tv_add(xs.tv, event.coords[i + 0]); + tv_add(ys.tv, event.coords[i + 1]); + } + + state.wasm.buffers['xs'].used += point_count * 4; + state.wasm.buffers['ys'].used += point_count * 4; tv_append(pressures.tv, event.press); state.wasm.buffers['pressures'].used += point_count; diff --git a/client/math.js b/client/math.js index 432fa39..c9c4519 100644 --- a/client/math.js +++ b/client/math.js @@ -43,7 +43,7 @@ function process_rdp_indices_r(state, zoom, mask, stroke, start, end) { } function process_rdp_indices(state, zoom, stroke) { - const point_count = (stroke.coords_to - stroke.coords_from) / 2; + const point_count = stroke.coords_to - stroke.coords_from; if (state.rdp_mask.length < point_count) { state.rdp_mask = new Uint8Array(point_count); @@ -252,17 +252,18 @@ function segment_interesects_quad(a, b, quad_topleft, quad_bottomright, quad_top function stroke_bbox(state, stroke) { const radius = stroke.width; // do not divide by 2 to account for max possible pressure - const coordinates = state.wasm.buffers['coordinates'].tv.data; + const xs = state.wasm.buffers['xs'].tv.data; + const ys = state.wasm.buffers['ys'].tv.data; - let min_x = coordinates[stroke.coords_from + 0] - radius; - let max_x = coordinates[stroke.coords_from + 0] + radius; + let min_x = xs[stroke.coords_from] - radius; + let max_x = xs[stroke.coords_from] + radius; - let min_y = coordinates[stroke.coords_from + 1] - radius; - let max_y = coordinates[stroke.coords_from + 1] + radius; + let min_y = ys[stroke.coords_from] - radius; + let max_y = ys[stroke.coords_from] + radius; - for (let i = stroke.coords_from + 2; i < stroke.coords_to; i += 2) { - const px = coordinates[i + 0]; - const py = coordinates[i + 1]; + for (let i = stroke.coords_from + 1; i < stroke.coords_to; ++i) { + const px = xs[i]; + const py = ys[i]; min_x = Math.min(min_x, px - radius); min_y = Math.min(min_y, py - radius); diff --git a/client/speed.js b/client/speed.js index f4459a8..fbf801a 100644 --- a/client/speed.js +++ b/client/speed.js @@ -7,8 +7,12 @@ async function init_wasm(state) { state.wasm.stroke_bytes = 4096; state.wasm.coords_bytes = 4096; state.wasm.buffers = { - 'coordinates': { - 'offset': state.wasm.exports.alloc_static(state.wasm.coords_bytes), + 'xs': { + 'offset': state.wasm.exports.alloc_static(state.wasm.coords_bytes / 2), + 'used': 0 + }, + 'ys': { + 'offset': state.wasm.exports.alloc_static(state.wasm.coords_bytes / 2), 'used': 0 }, 'coords_from': { @@ -27,8 +31,10 @@ async function init_wasm(state) { const mem = state.wasm.exports.memory.buffer; - state.wasm.buffers['coordinates'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 4, - mem, state.wasm.buffers['coordinates'].offset); + state.wasm.buffers['xs'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 8, + mem, state.wasm.buffers['xs'].offset); + state.wasm.buffers['ys'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 8, + mem, state.wasm.buffers['ys'].offset); state.wasm.buffers['coords_from'].tv = tv_create_on(Uint32Array, state.wasm.stroke_bytes / 4, mem, state.wasm.buffers['coords_from'].offset); state.wasm.buffers['line_threshold'].tv = tv_create_on(Float32Array, state.wasm.stroke_bytes / 4, @@ -52,13 +58,13 @@ function wasm_ensure_by(state, nstrokes, ncoords) { let realloc = false; - if (buffers['coordinates'].used + ncoords * 4 > state.wasm.coords_bytes) { - state.wasm.coords_bytes += round_to_pow2(ncoords, 4096 * 16); // 1 wasm page (although it doesn't matter here) + if (buffers['xs'].used + ncoords * 4 > state.wasm.coords_bytes / 2) { + state.wasm.coords_bytes += round_to_pow2(ncoords * 4, 4096 * 16); // 1 wasm page (although it doesn't matter here) realloc = true; } - if (buffers['coords_from'].used + nstrokes * 4 > state.wasm.stroke_bytes) { - state.wasm.stroke_bytes += round_to_pow2(nstrokes, 4096 * 16); + if (buffers['coords_from'].used + nstrokes * 4 > state.wasm.stroke_bytes / 2) { + state.wasm.stroke_bytes += round_to_pow2(nstrokes * 4, 4096 * 16); realloc = true; } @@ -70,22 +76,26 @@ function wasm_ensure_by(state, nstrokes, ncoords) { const mem = state.wasm.exports.memory.buffer; const memv = new Uint8Array(mem); - buffers['coordinates'].offset = state.wasm.exports.alloc_static(state.wasm.coords_bytes); + buffers['xs'].offset = state.wasm.exports.alloc_static(state.wasm.coords_bytes / 2); + buffers['ys'].offset = state.wasm.exports.alloc_static(state.wasm.coords_bytes / 2); buffers['coords_from'].offset = state.wasm.exports.alloc_static(state.wasm.stroke_bytes); buffers['line_threshold'].offset = state.wasm.exports.alloc_static(state.wasm.stroke_bytes); buffers['pressures'].offset = state.wasm.exports.alloc_static(state.wasm.coords_bytes / 8); - buffers['coordinates'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 4, mem, buffers['coordinates'].offset); + buffers['xs'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 8, mem, buffers['xs'].offset); + buffers['ys'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 8, mem, buffers['ys'].offset); buffers['coords_from'].tv = tv_create_on(Uint32Array, state.wasm.stroke_bytes / 4, mem, buffers['coords_from'].offset); buffers['line_threshold'].tv = tv_create_on(Float32Array, state.wasm.stroke_bytes / 4, mem, buffers['line_threshold'].offset); buffers['pressures'].tv = tv_create_on(Uint8Array, state.wasm.coords_bytes / 8, mem, buffers['pressures'].offset); - buffers['coordinates'].tv.size = buffers['coordinates'].used / 4; + // TODO: this should have been automatic maybe? + buffers['xs'].tv.size = buffers['xs'].used / 4; + buffers['ys'].tv.size = buffers['ys'].used / 4; buffers['coords_from'].tv.size = buffers['coords_from'].used / 4; buffers['line_threshold'].tv.size = buffers['line_threshold'].used / 4; buffers['pressures'].tv.size = buffers['pressures'].used; - const tmp = new Uint8Array(Math.max(state.wasm.coords_bytes / 8, state.wasm.stroke_bytes)); // TODO: needed? + const tmp = new Uint8Array(Math.max(state.wasm.coords_bytes, state.wasm.stroke_bytes)); // TODO: needed? // Copy from back to front (otherwise we will overwrite) tmp.set(new Uint8Array(mem, old_pressures_offset, buffers['pressures'].used)); @@ -113,14 +123,15 @@ function do_lod(state, context) { clipped_indices, context.clipped_indices.size, state.canvas.zoom, buffers['coords_from'].offset, buffers['line_threshold'].offset, - buffers['coordinates'].offset, + buffers['xs'].offset, + buffers['ys'].offset, buffers['pressures'].offset, - buffers['coordinates'].used / 4, + buffers['xs'].used / 4, ); // Use results without copying from WASM memory const result_offset = clipped_indices + context.clipped_indices.size * 4 - + (context.clipped_indices.size + 1) * 4 + buffers['coordinates'].used / 2; + + (context.clipped_indices.size + 1) * 4 + buffers['xs'].used; const wasm_points = new Float32Array(state.wasm.exports.memory.buffer, result_offset, segment_count * 2); diff --git a/client/wasm/lod.c b/client/wasm/lod.c index 19a13e3..2c3420d 100644 --- a/client/wasm/lod.c +++ b/client/wasm/lod.c @@ -1,3 +1,5 @@ +// clang -g -Wall -Wextra -O3 -Wl,--export-all,--no-entry --target=wasm32 -Xclang -target-feature -Xclang +simd128 lod.c -nostdlib -o lod.wasm + #include extern char __heap_base; @@ -34,20 +36,18 @@ alloc_dynamic(int size) } static int -rdp_find_max(float *coordinates, unsigned char *pressures, float zoom, int coords_from, +rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coords_from, int segment_start, int segment_end) { - float EPS = 0.125 / zoom; + float EPS = 0.125f / zoom * 255.0f; -// __i32x4 a = wasm_i32x4_load16x4(coordinates); - int result = -1; float max_dist = 0.0f; - float ax = coordinates[coords_from + segment_start * 2 + 0]; - float ay = coordinates[coords_from + segment_start * 2 + 1]; - float bx = coordinates[coords_from + segment_end * 2 + 0]; - float by = coordinates[coords_from + segment_end * 2 + 1]; + float ax = xs[coords_from + segment_start]; + float ay = ys[coords_from + segment_start]; + float bx = xs[coords_from + segment_end]; + float by = ys[coords_from + segment_end]; unsigned char ap = pressures[coords_from / 2 + segment_start]; unsigned char bp = pressures[coords_from / 2 + segment_end]; @@ -56,12 +56,63 @@ rdp_find_max(float *coordinates, unsigned char *pressures, float zoom, int coord float dy = by - ay; float dist_ab = __builtin_sqrtf(dx * dx + dy * dy); - float dir_nx = dy / dist_ab; - float dir_ny = -dx / dist_ab; + float dir_nx = dy / dist_ab * 255.0f; + float dir_ny = -dx / dist_ab * 255.0f; + +#if 0 + v128_t scale_255 = wasm_f32x4_splat(1.0f / 255.0f); + v128_t EPSs = wasm_f32x4_splat(EPS); +#endif for (int i = segment_start + 1; i < segment_end; ++i) { - float px = coordinates[coords_from + i * 2 + 0]; - float py = coordinates[coords_from + i * 2 + 1]; +#if 0 + v128_t pxs = wasm_v128_load(coordinates_x + coords_from + i); + v128_t pxs = wasm_v128_load(coordinates_y + coords_from + i); + + v128_t pps = wasm_v128_load(pressures + coords_from + i); + + v128_t apxs = wasm_f32x4_sub(pxs, axs); + v128_t apys = wasm_f32x4_sub(pys, ays); + + v128_t dists = wasm_f32x4_add( + wasm_f32x4_add( + wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, aps)), scale_255), + wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, bps)), scale_255) + ), + wasm_f32x4_abs( + wasm_f32x4_add( + wasm_f32x4_mul(apxs, dir_nxs), + wasm_f32x4_mul(apys, dir_nys) + ) + ) + ); + + v128_t dist_mask = wasm_f32x4_gt(dists, EPSs); + v128_t max_mask = wasm_f32x4_gt(dists, max_dists); + v128_t final_mask = wasm_v128_and(dist_mask, max_mask); + + if (!wasm_v128_any_true(final_mask)) { + // fast path? hopefully? + continue; + } + + // Places max(0, 2) and max(1, 3) into lanes (0, 1) + v128_t max_02_13 = wasm_f32x4_max( + dists, + wasm_i32x4_shuffle(dists, dists, 2, 3, 2, 3) + ); + + // Places max(max(0, 2), max(1, 3)) into lane 0 + v128_t max_0123 = wasm_f32x4_max( + max_02_13, + wasm_i32x4_shuffle(max_02_13, max_02_13, 1, 1, 1, 1) + ); + + float final_max = wasm_f32x4_extract_lane(max_0123, 0); +#endif + + float px = xs[coords_from + i]; + float py = ys[coords_from + i]; unsigned char pp = pressures[coords_from + i]; @@ -69,7 +120,7 @@ rdp_find_max(float *coordinates, unsigned char *pressures, float zoom, int coord float apy = py - ay; float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny) - + __builtin_abs(pp - ap) / 255.0f + __builtin_abs(pp - bp) / 255.0f; + + __builtin_abs(pp - ap) + __builtin_abs(pp - bp); if (dist > EPS && dist > max_dist) { result = i; @@ -84,7 +135,8 @@ int do_lod(int *clipped_indices, int clipped_count, float zoom, int *stroke_coords_from, float *line_threshold, - float *coordinates, + float *xs, + float *ys, unsigned char *pressures, int coordinates_count) { @@ -93,7 +145,7 @@ do_lod(int *clipped_indices, int clipped_count, float zoom, } int *segments_from = alloc_dynamic((clipped_count + 1) * 4); - int *segments = alloc_dynamic(coordinates_count / 2 * 4); + int *segments = alloc_dynamic(coordinates_count * 4); int segments_head = 0; int stack[4096]; // TODO: what's a reasonable max size for this? @@ -105,7 +157,7 @@ do_lod(int *clipped_indices, int clipped_count, float zoom, int coords_from = stroke_coords_from[stroke_index]; int coords_to = stroke_coords_from[stroke_index + 1]; - int point_count = (coords_to - coords_from) / 2; + int point_count = coords_to - coords_from; // Basic CSR crap segments_from[i] = segments_head; @@ -134,7 +186,7 @@ do_lod(int *clipped_indices, int clipped_count, float zoom, if (type == 1) { segments[segments_head++] = start; } else { - int max = rdp_find_max(coordinates, pressures, zoom, coords_from, start, end); + int max = rdp_find_max(xs, ys, pressures, zoom, coords_from, start, end); if (max != -1) { segment_count += 1; @@ -180,13 +232,13 @@ do_lod(int *clipped_indices, int clipped_count, float zoom, for (int j = from; j < to; ++j) { int point_index = segments[j]; - float x = coordinates[base_stroke + point_index * 2 + 0]; - float y = coordinates[base_stroke + point_index * 2 + 1]; + float x = xs[base_stroke + point_index]; + float y = ys[base_stroke + point_index]; points[phead++] = x; points[phead++] = y; - pressures_res[ihead] = pressures[base_stroke / 2 + point_index]; + pressures_res[ihead] = pressures[base_stroke + point_index]; if (j != to - 1) { ids[ihead++] = stroke_index; diff --git a/client/wasm/lod.wasm b/client/wasm/lod.wasm index 64b42bf..3f921f6 100755 Binary files a/client/wasm/lod.wasm and b/client/wasm/lod.wasm differ