Browse Source

AoS -> SoA for point coordinates

ssao
A.Olokhtonov 10 months ago
parent
commit
6f78c0ae21
  1. 18
      client/client_recv.js
  2. 19
      client/math.js
  3. 41
      client/speed.js
  4. 92
      client/wasm/lod.c
  5. BIN
      client/wasm/lod.wasm

18
client/client_recv.js

@ -244,17 +244,23 @@ function handle_event(state, context, event, options = {}) {
wasm_ensure_by(state, 1, event.coords.length); wasm_ensure_by(state, 1, event.coords.length);
const coordinates = state.wasm.buffers['coordinates'];
const pressures = state.wasm.buffers['pressures']; const pressures = state.wasm.buffers['pressures'];
const xs = state.wasm.buffers['xs'];
const ys = state.wasm.buffers['ys'];
event.coords_from = coordinates.tv.size; event.coords_from = xs.tv.size;
event.coords_to = coordinates.tv.size + point_count * 2; event.coords_to = xs.tv.size + point_count;
tv_add(state.wasm.buffers['coords_from'].tv, coordinates.tv.size + point_count * 2); tv_add(state.wasm.buffers['coords_from'].tv, xs.tv.size + point_count);
state.wasm.buffers['coords_from'].used += 4; // 4 bytes, not 4 ints state.wasm.buffers['coords_from'].used += 4; // 4 bytes, not 4 ints
tv_append(coordinates.tv, event.coords); for (let i = 0; i < event.coords.length; i += 2) {
state.wasm.buffers['coordinates'].used += point_count * 2 * 4; tv_add(xs.tv, event.coords[i + 0]);
tv_add(ys.tv, event.coords[i + 1]);
}
state.wasm.buffers['xs'].used += point_count * 4;
state.wasm.buffers['ys'].used += point_count * 4;
tv_append(pressures.tv, event.press); tv_append(pressures.tv, event.press);
state.wasm.buffers['pressures'].used += point_count; state.wasm.buffers['pressures'].used += point_count;

19
client/math.js

@ -43,7 +43,7 @@ function process_rdp_indices_r(state, zoom, mask, stroke, start, end) {
} }
function process_rdp_indices(state, zoom, stroke) { function process_rdp_indices(state, zoom, stroke) {
const point_count = (stroke.coords_to - stroke.coords_from) / 2; const point_count = stroke.coords_to - stroke.coords_from;
if (state.rdp_mask.length < point_count) { if (state.rdp_mask.length < point_count) {
state.rdp_mask = new Uint8Array(point_count); state.rdp_mask = new Uint8Array(point_count);
@ -252,17 +252,18 @@ function segment_interesects_quad(a, b, quad_topleft, quad_bottomright, quad_top
function stroke_bbox(state, stroke) { function stroke_bbox(state, stroke) {
const radius = stroke.width; // do not divide by 2 to account for max possible pressure const radius = stroke.width; // do not divide by 2 to account for max possible pressure
const coordinates = state.wasm.buffers['coordinates'].tv.data; const xs = state.wasm.buffers['xs'].tv.data;
const ys = state.wasm.buffers['ys'].tv.data;
let min_x = coordinates[stroke.coords_from + 0] - radius; let min_x = xs[stroke.coords_from] - radius;
let max_x = coordinates[stroke.coords_from + 0] + radius; let max_x = xs[stroke.coords_from] + radius;
let min_y = coordinates[stroke.coords_from + 1] - radius; let min_y = ys[stroke.coords_from] - radius;
let max_y = coordinates[stroke.coords_from + 1] + radius; let max_y = ys[stroke.coords_from] + radius;
for (let i = stroke.coords_from + 2; i < stroke.coords_to; i += 2) { for (let i = stroke.coords_from + 1; i < stroke.coords_to; ++i) {
const px = coordinates[i + 0]; const px = xs[i];
const py = coordinates[i + 1]; const py = ys[i];
min_x = Math.min(min_x, px - radius); min_x = Math.min(min_x, px - radius);
min_y = Math.min(min_y, py - radius); min_y = Math.min(min_y, py - radius);

41
client/speed.js

@ -7,8 +7,12 @@ async function init_wasm(state) {
state.wasm.stroke_bytes = 4096; state.wasm.stroke_bytes = 4096;
state.wasm.coords_bytes = 4096; state.wasm.coords_bytes = 4096;
state.wasm.buffers = { state.wasm.buffers = {
'coordinates': { 'xs': {
'offset': state.wasm.exports.alloc_static(state.wasm.coords_bytes), 'offset': state.wasm.exports.alloc_static(state.wasm.coords_bytes / 2),
'used': 0
},
'ys': {
'offset': state.wasm.exports.alloc_static(state.wasm.coords_bytes / 2),
'used': 0 'used': 0
}, },
'coords_from': { 'coords_from': {
@ -27,8 +31,10 @@ async function init_wasm(state) {
const mem = state.wasm.exports.memory.buffer; const mem = state.wasm.exports.memory.buffer;
state.wasm.buffers['coordinates'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 4, state.wasm.buffers['xs'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 8,
mem, state.wasm.buffers['coordinates'].offset); mem, state.wasm.buffers['xs'].offset);
state.wasm.buffers['ys'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 8,
mem, state.wasm.buffers['ys'].offset);
state.wasm.buffers['coords_from'].tv = tv_create_on(Uint32Array, state.wasm.stroke_bytes / 4, state.wasm.buffers['coords_from'].tv = tv_create_on(Uint32Array, state.wasm.stroke_bytes / 4,
mem, state.wasm.buffers['coords_from'].offset); mem, state.wasm.buffers['coords_from'].offset);
state.wasm.buffers['line_threshold'].tv = tv_create_on(Float32Array, state.wasm.stroke_bytes / 4, state.wasm.buffers['line_threshold'].tv = tv_create_on(Float32Array, state.wasm.stroke_bytes / 4,
@ -52,13 +58,13 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
let realloc = false; let realloc = false;
if (buffers['coordinates'].used + ncoords * 4 > state.wasm.coords_bytes) { if (buffers['xs'].used + ncoords * 4 > state.wasm.coords_bytes / 2) {
state.wasm.coords_bytes += round_to_pow2(ncoords, 4096 * 16); // 1 wasm page (although it doesn't matter here) state.wasm.coords_bytes += round_to_pow2(ncoords * 4, 4096 * 16); // 1 wasm page (although it doesn't matter here)
realloc = true; realloc = true;
} }
if (buffers['coords_from'].used + nstrokes * 4 > state.wasm.stroke_bytes) { if (buffers['coords_from'].used + nstrokes * 4 > state.wasm.stroke_bytes / 2) {
state.wasm.stroke_bytes += round_to_pow2(nstrokes, 4096 * 16); state.wasm.stroke_bytes += round_to_pow2(nstrokes * 4, 4096 * 16);
realloc = true; realloc = true;
} }
@ -70,22 +76,26 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
const mem = state.wasm.exports.memory.buffer; const mem = state.wasm.exports.memory.buffer;
const memv = new Uint8Array(mem); const memv = new Uint8Array(mem);
buffers['coordinates'].offset = state.wasm.exports.alloc_static(state.wasm.coords_bytes); buffers['xs'].offset = state.wasm.exports.alloc_static(state.wasm.coords_bytes / 2);
buffers['ys'].offset = state.wasm.exports.alloc_static(state.wasm.coords_bytes / 2);
buffers['coords_from'].offset = state.wasm.exports.alloc_static(state.wasm.stroke_bytes); buffers['coords_from'].offset = state.wasm.exports.alloc_static(state.wasm.stroke_bytes);
buffers['line_threshold'].offset = state.wasm.exports.alloc_static(state.wasm.stroke_bytes); buffers['line_threshold'].offset = state.wasm.exports.alloc_static(state.wasm.stroke_bytes);
buffers['pressures'].offset = state.wasm.exports.alloc_static(state.wasm.coords_bytes / 8); buffers['pressures'].offset = state.wasm.exports.alloc_static(state.wasm.coords_bytes / 8);
buffers['coordinates'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 4, mem, buffers['coordinates'].offset); buffers['xs'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 8, mem, buffers['xs'].offset);
buffers['ys'].tv = tv_create_on(Float32Array, state.wasm.coords_bytes / 8, mem, buffers['ys'].offset);
buffers['coords_from'].tv = tv_create_on(Uint32Array, state.wasm.stroke_bytes / 4, mem, buffers['coords_from'].offset); buffers['coords_from'].tv = tv_create_on(Uint32Array, state.wasm.stroke_bytes / 4, mem, buffers['coords_from'].offset);
buffers['line_threshold'].tv = tv_create_on(Float32Array, state.wasm.stroke_bytes / 4, mem, buffers['line_threshold'].offset); buffers['line_threshold'].tv = tv_create_on(Float32Array, state.wasm.stroke_bytes / 4, mem, buffers['line_threshold'].offset);
buffers['pressures'].tv = tv_create_on(Uint8Array, state.wasm.coords_bytes / 8, mem, buffers['pressures'].offset); buffers['pressures'].tv = tv_create_on(Uint8Array, state.wasm.coords_bytes / 8, mem, buffers['pressures'].offset);
buffers['coordinates'].tv.size = buffers['coordinates'].used / 4; // TODO: this should have been automatic maybe?
buffers['xs'].tv.size = buffers['xs'].used / 4;
buffers['ys'].tv.size = buffers['ys'].used / 4;
buffers['coords_from'].tv.size = buffers['coords_from'].used / 4; buffers['coords_from'].tv.size = buffers['coords_from'].used / 4;
buffers['line_threshold'].tv.size = buffers['line_threshold'].used / 4; buffers['line_threshold'].tv.size = buffers['line_threshold'].used / 4;
buffers['pressures'].tv.size = buffers['pressures'].used; buffers['pressures'].tv.size = buffers['pressures'].used;
const tmp = new Uint8Array(Math.max(state.wasm.coords_bytes / 8, state.wasm.stroke_bytes)); // TODO: needed? const tmp = new Uint8Array(Math.max(state.wasm.coords_bytes, state.wasm.stroke_bytes)); // TODO: needed?
// Copy from back to front (otherwise we will overwrite) // Copy from back to front (otherwise we will overwrite)
tmp.set(new Uint8Array(mem, old_pressures_offset, buffers['pressures'].used)); tmp.set(new Uint8Array(mem, old_pressures_offset, buffers['pressures'].used));
@ -113,14 +123,15 @@ function do_lod(state, context) {
clipped_indices, context.clipped_indices.size, state.canvas.zoom, clipped_indices, context.clipped_indices.size, state.canvas.zoom,
buffers['coords_from'].offset, buffers['coords_from'].offset,
buffers['line_threshold'].offset, buffers['line_threshold'].offset,
buffers['coordinates'].offset, buffers['xs'].offset,
buffers['ys'].offset,
buffers['pressures'].offset, buffers['pressures'].offset,
buffers['coordinates'].used / 4, buffers['xs'].used / 4,
); );
// Use results without copying from WASM memory // Use results without copying from WASM memory
const result_offset = clipped_indices + context.clipped_indices.size * 4 const result_offset = clipped_indices + context.clipped_indices.size * 4
+ (context.clipped_indices.size + 1) * 4 + buffers['coordinates'].used / 2; + (context.clipped_indices.size + 1) * 4 + buffers['xs'].used;
const wasm_points = new Float32Array(state.wasm.exports.memory.buffer, const wasm_points = new Float32Array(state.wasm.exports.memory.buffer,
result_offset, segment_count * 2); result_offset, segment_count * 2);

92
client/wasm/lod.c

@ -1,3 +1,5 @@
// clang -g -Wall -Wextra -O3 -Wl,--export-all,--no-entry --target=wasm32 -Xclang -target-feature -Xclang +simd128 lod.c -nostdlib -o lod.wasm
#include <wasm_simd128.h> #include <wasm_simd128.h>
extern char __heap_base; extern char __heap_base;
@ -34,20 +36,18 @@ alloc_dynamic(int size)
} }
static int static int
rdp_find_max(float *coordinates, unsigned char *pressures, float zoom, int coords_from, rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coords_from,
int segment_start, int segment_end) int segment_start, int segment_end)
{ {
float EPS = 0.125 / zoom; float EPS = 0.125f / zoom * 255.0f;
// __i32x4 a = wasm_i32x4_load16x4(coordinates);
int result = -1; int result = -1;
float max_dist = 0.0f; float max_dist = 0.0f;
float ax = coordinates[coords_from + segment_start * 2 + 0]; float ax = xs[coords_from + segment_start];
float ay = coordinates[coords_from + segment_start * 2 + 1]; float ay = ys[coords_from + segment_start];
float bx = coordinates[coords_from + segment_end * 2 + 0]; float bx = xs[coords_from + segment_end];
float by = coordinates[coords_from + segment_end * 2 + 1]; float by = ys[coords_from + segment_end];
unsigned char ap = pressures[coords_from / 2 + segment_start]; unsigned char ap = pressures[coords_from / 2 + segment_start];
unsigned char bp = pressures[coords_from / 2 + segment_end]; unsigned char bp = pressures[coords_from / 2 + segment_end];
@ -56,12 +56,63 @@ rdp_find_max(float *coordinates, unsigned char *pressures, float zoom, int coord
float dy = by - ay; float dy = by - ay;
float dist_ab = __builtin_sqrtf(dx * dx + dy * dy); float dist_ab = __builtin_sqrtf(dx * dx + dy * dy);
float dir_nx = dy / dist_ab; float dir_nx = dy / dist_ab * 255.0f;
float dir_ny = -dx / dist_ab; float dir_ny = -dx / dist_ab * 255.0f;
#if 0
v128_t scale_255 = wasm_f32x4_splat(1.0f / 255.0f);
v128_t EPSs = wasm_f32x4_splat(EPS);
#endif
for (int i = segment_start + 1; i < segment_end; ++i) { for (int i = segment_start + 1; i < segment_end; ++i) {
float px = coordinates[coords_from + i * 2 + 0]; #if 0
float py = coordinates[coords_from + i * 2 + 1]; v128_t pxs = wasm_v128_load(coordinates_x + coords_from + i);
v128_t pxs = wasm_v128_load(coordinates_y + coords_from + i);
v128_t pps = wasm_v128_load(pressures + coords_from + i);
v128_t apxs = wasm_f32x4_sub(pxs, axs);
v128_t apys = wasm_f32x4_sub(pys, ays);
v128_t dists = wasm_f32x4_add(
wasm_f32x4_add(
wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, aps)), scale_255),
wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, bps)), scale_255)
),
wasm_f32x4_abs(
wasm_f32x4_add(
wasm_f32x4_mul(apxs, dir_nxs),
wasm_f32x4_mul(apys, dir_nys)
)
)
);
v128_t dist_mask = wasm_f32x4_gt(dists, EPSs);
v128_t max_mask = wasm_f32x4_gt(dists, max_dists);
v128_t final_mask = wasm_v128_and(dist_mask, max_mask);
if (!wasm_v128_any_true(final_mask)) {
// fast path? hopefully?
continue;
}
// Places max(0, 2) and max(1, 3) into lanes (0, 1)
v128_t max_02_13 = wasm_f32x4_max(
dists,
wasm_i32x4_shuffle(dists, dists, 2, 3, 2, 3)
);
// Places max(max(0, 2), max(1, 3)) into lane 0
v128_t max_0123 = wasm_f32x4_max(
max_02_13,
wasm_i32x4_shuffle(max_02_13, max_02_13, 1, 1, 1, 1)
);
float final_max = wasm_f32x4_extract_lane(max_0123, 0);
#endif
float px = xs[coords_from + i];
float py = ys[coords_from + i];
unsigned char pp = pressures[coords_from + i]; unsigned char pp = pressures[coords_from + i];
@ -69,7 +120,7 @@ rdp_find_max(float *coordinates, unsigned char *pressures, float zoom, int coord
float apy = py - ay; float apy = py - ay;
float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny) float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)
+ __builtin_abs(pp - ap) / 255.0f + __builtin_abs(pp - bp) / 255.0f; + __builtin_abs(pp - ap) + __builtin_abs(pp - bp);
if (dist > EPS && dist > max_dist) { if (dist > EPS && dist > max_dist) {
result = i; result = i;
@ -84,7 +135,8 @@ int
do_lod(int *clipped_indices, int clipped_count, float zoom, do_lod(int *clipped_indices, int clipped_count, float zoom,
int *stroke_coords_from, int *stroke_coords_from,
float *line_threshold, float *line_threshold,
float *coordinates, float *xs,
float *ys,
unsigned char *pressures, unsigned char *pressures,
int coordinates_count) int coordinates_count)
{ {
@ -93,7 +145,7 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
} }
int *segments_from = alloc_dynamic((clipped_count + 1) * 4); int *segments_from = alloc_dynamic((clipped_count + 1) * 4);
int *segments = alloc_dynamic(coordinates_count / 2 * 4); int *segments = alloc_dynamic(coordinates_count * 4);
int segments_head = 0; int segments_head = 0;
int stack[4096]; // TODO: what's a reasonable max size for this? int stack[4096]; // TODO: what's a reasonable max size for this?
@ -105,7 +157,7 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
int coords_from = stroke_coords_from[stroke_index]; int coords_from = stroke_coords_from[stroke_index];
int coords_to = stroke_coords_from[stroke_index + 1]; int coords_to = stroke_coords_from[stroke_index + 1];
int point_count = (coords_to - coords_from) / 2; int point_count = coords_to - coords_from;
// Basic CSR crap // Basic CSR crap
segments_from[i] = segments_head; segments_from[i] = segments_head;
@ -134,7 +186,7 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
if (type == 1) { if (type == 1) {
segments[segments_head++] = start; segments[segments_head++] = start;
} else { } else {
int max = rdp_find_max(coordinates, pressures, zoom, coords_from, start, end); int max = rdp_find_max(xs, ys, pressures, zoom, coords_from, start, end);
if (max != -1) { if (max != -1) {
segment_count += 1; segment_count += 1;
@ -180,13 +232,13 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
for (int j = from; j < to; ++j) { for (int j = from; j < to; ++j) {
int point_index = segments[j]; int point_index = segments[j];
float x = coordinates[base_stroke + point_index * 2 + 0]; float x = xs[base_stroke + point_index];
float y = coordinates[base_stroke + point_index * 2 + 1]; float y = ys[base_stroke + point_index];
points[phead++] = x; points[phead++] = x;
points[phead++] = y; points[phead++] = y;
pressures_res[ihead] = pressures[base_stroke / 2 + point_index]; pressures_res[ihead] = pressures[base_stroke + point_index];
if (j != to - 1) { if (j != to - 1) {
ids[ihead++] = stroke_index; ids[ihead++] = stroke_index;

BIN
client/wasm/lod.wasm

Binary file not shown.
Loading…
Cancel
Save