Compare commits

...

2 Commits

Author SHA1 Message Date
A.Olokhtonov 4a6715ef66 Batching is close to working. Not quite though 10 months ago
A.Olokhtonov ce824a8e31 Pass stroke widths to wasm (not actually writing any values right now, … 10 months ago
  1. 1
      client/index.js
  2. 2
      client/lod_worker.js
  3. 36
      client/speed.js
  4. 61
      client/wasm/lod.c
  5. BIN
      client/wasm/lod.wasm
  6. 31
      client/webgl_draw.js
  7. 3
      client/webgl_geometry.js

1
client/index.js

@ -227,6 +227,7 @@ async function main() {
'instance_data_points': tv_create(Float32Array, 4096), 'instance_data_points': tv_create(Float32Array, 4096),
'instance_data_ids': tv_create(Uint32Array, 4096), 'instance_data_ids': tv_create(Uint32Array, 4096),
'instance_data_pressures': tv_create(Uint8Array, 4096), 'instance_data_pressures': tv_create(Uint8Array, 4096),
'instance_data_batches': tv_create(Uint32Array, 4096),
'dynamic_instance_points': tv_create(Float32Array, 4096), 'dynamic_instance_points': tv_create(Float32Array, 4096),
'dynamic_instance_pressure': tv_create(Uint8Array, 4096), 'dynamic_instance_pressure': tv_create(Uint8Array, 4096),

2
client/lod_worker.js

@ -19,11 +19,13 @@ function work(indices_base, indices_count, zoom, offsets) {
exports.do_lod( exports.do_lod(
indices_base, indices_count, zoom, indices_base, indices_count, zoom,
offsets['coords_from'], offsets['coords_from'],
offsets['width'],
offsets['xs'], offsets['xs'],
offsets['ys'], offsets['ys'],
offsets['pressures'], offsets['pressures'],
offsets['result_buffers'] + thread_id * 4, offsets['result_buffers'] + thread_id * 4,
offsets['result_counts'] + thread_id * 4, offsets['result_counts'] + thread_id * 4,
offsets['result_batch_counts'] + thread_id * 4,
); );
} catch (e) { } catch (e) {
console.error('WASM:', e); console.error('WASM:', e);

36
client/speed.js

@ -43,7 +43,7 @@ async function init_wasm(state) {
env: { 'memory': memory } env: { 'memory': memory }
}); });
const nworkers = navigator.hardwareConcurrency; const nworkers = 1; //navigator.hardwareConcurrency;
state.wasm.exports = master_wasm.instance.exports; state.wasm.exports = master_wasm.instance.exports;
state.wasm.heap_base = state.wasm.exports.alloc_static(0); state.wasm.heap_base = state.wasm.exports.alloc_static(0);
@ -80,12 +80,17 @@ async function init_wasm(state) {
'used': 0, 'used': 0,
'cap': initial 'cap': initial
}, },
'width': {
'used': 0,
'cap': initial
}
}; };
state.wasm.buffers['xs'].offset = state.wasm.exports.alloc_static(initial); state.wasm.buffers['xs'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['ys'].offset = state.wasm.exports.alloc_static(initial); state.wasm.buffers['ys'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['pressures'].offset = state.wasm.exports.alloc_static(initial); state.wasm.buffers['pressures'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['coords_from'].offset = state.wasm.exports.alloc_static(initial); state.wasm.buffers['coords_from'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['width'].offset = state.wasm.exports.alloc_static(initial);
const mem = state.wasm.memory.buffer; const mem = state.wasm.memory.buffer;
@ -97,6 +102,8 @@ async function init_wasm(state) {
mem, state.wasm.buffers['pressures'].offset); mem, state.wasm.buffers['pressures'].offset);
state.wasm.buffers['coords_from'].tv = tv_create_on(Uint32Array, initial / 4, state.wasm.buffers['coords_from'].tv = tv_create_on(Uint32Array, initial / 4,
mem, state.wasm.buffers['coords_from'].offset); mem, state.wasm.buffers['coords_from'].offset);
state.wasm.buffers['width'].tv = tv_create_on(Uint32Array, initial / 4,
mem, state.wasm.buffers['width'].offset);
tv_add(state.wasm.buffers['coords_from'].tv, 0); tv_add(state.wasm.buffers['coords_from'].tv, 0);
state.wasm.buffers['coords_from'].used = 4; state.wasm.buffers['coords_from'].used = 4;
@ -108,6 +115,7 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
const old_ys_offset = buffers['ys'].offset; const old_ys_offset = buffers['ys'].offset;
const old_coords_from_offset = buffers['coords_from'].offset; const old_coords_from_offset = buffers['coords_from'].offset;
const old_pressures_offset = buffers['pressures'].offset; const old_pressures_offset = buffers['pressures'].offset;
const old_width_offset = buffers['width'].offset;
let realloc = false; let realloc = false;
let coords_bytes = buffers['xs'].cap; let coords_bytes = buffers['xs'].cap;
@ -135,23 +143,31 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
buffers['ys'].offset = state.wasm.exports.alloc_static(coords_bytes); buffers['ys'].offset = state.wasm.exports.alloc_static(coords_bytes);
buffers['pressures'].offset = state.wasm.exports.alloc_static(coords_bytes); buffers['pressures'].offset = state.wasm.exports.alloc_static(coords_bytes);
buffers['coords_from'].offset = state.wasm.exports.alloc_static(stroke_bytes); buffers['coords_from'].offset = state.wasm.exports.alloc_static(stroke_bytes);
buffers['width'].offset = state.wasm.exports.alloc_static(stroke_bytes);
buffers['xs'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['xs'].offset); buffers['xs'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['xs'].offset);
buffers['ys'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['ys'].offset); buffers['ys'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['ys'].offset);
buffers['pressures'].tv = tv_create_on(Uint8Array, coords_bytes, mem, buffers['pressures'].offset); buffers['pressures'].tv = tv_create_on(Uint8Array, coords_bytes, mem, buffers['pressures'].offset);
buffers['coords_from'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['coords_from'].offset); buffers['coords_from'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['coords_from'].offset);
buffers['width'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['width'].offset);
// TODO: this should have been automatic maybe? // TODO: this should have been automatic maybe?
buffers['xs'].tv.size = buffers['xs'].used / 4; buffers['xs'].tv.size = buffers['xs'].used / 4;
buffers['ys'].tv.size = buffers['ys'].used / 4; buffers['ys'].tv.size = buffers['ys'].used / 4;
buffers['pressures'].tv.size = buffers['pressures'].used; buffers['pressures'].tv.size = buffers['pressures'].used;
buffers['coords_from'].tv.size = buffers['coords_from'].used / 4; buffers['coords_from'].tv.size = buffers['coords_from'].used / 4;
buffers['width'].tv.size = buffers['width'].used / 4;
// TODO: this is SUS, should all the caps really be coords_bytes?
buffers['xs'].cap = buffers['ys'].cap = buffers['pressures'].cap = coords_bytes; buffers['xs'].cap = buffers['ys'].cap = buffers['pressures'].cap = coords_bytes;
buffers['coords_from'].cap = stroke_bytes; buffers['coords_from'].cap = buffers['width'].cap = stroke_bytes;
const tmp = new Uint8Array(Math.max(coords_bytes, stroke_bytes)); const tmp = new Uint8Array(Math.max(coords_bytes, stroke_bytes));
// Copy from back to front (otherwise we will overwrite) // Copy from back to front (otherwise we will overwrite)
tmp.set(new Uint8Array(mem, old_width_offset, buffers['width'].used));
memv.set(new Uint8Array(tmp.buffer, 0, buffers['width'].used), buffers['width'].offset);
tmp.set(new Uint8Array(mem, old_coords_from_offset, buffers['coords_from'].used)); tmp.set(new Uint8Array(mem, old_coords_from_offset, buffers['coords_from'].used));
memv.set(new Uint8Array(tmp.buffer, 0, buffers['coords_from'].used), buffers['coords_from'].offset); memv.set(new Uint8Array(tmp.buffer, 0, buffers['coords_from'].used), buffers['coords_from'].offset);
@ -169,6 +185,7 @@ async function do_lod(state, context) {
const buffers = state.wasm.buffers; const buffers = state.wasm.buffers;
const result_buffers = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4); const result_buffers = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
const result_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4); const result_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
const result_batch_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
const clipped_indices = state.wasm.exports.alloc_dynamic(context.clipped_indices.size * 4); const clipped_indices = state.wasm.exports.alloc_dynamic(context.clipped_indices.size * 4);
const mem = new Uint8Array(state.wasm.memory.buffer); const mem = new Uint8Array(state.wasm.memory.buffer);
@ -180,11 +197,13 @@ async function do_lod(state, context) {
const indices_per_thread = Math.floor(context.clipped_indices.size / state.wasm.workers.length); const indices_per_thread = Math.floor(context.clipped_indices.size / state.wasm.workers.length);
const offsets = { const offsets = {
'coords_from': buffers['coords_from'].offset, 'coords_from': buffers['coords_from'].offset,
'width': buffers['width'].offset,
'xs': buffers['xs'].offset, 'xs': buffers['xs'].offset,
'ys': buffers['ys'].offset, 'ys': buffers['ys'].offset,
'pressures': buffers['pressures'].offset, 'pressures': buffers['pressures'].offset,
'result_buffers': result_buffers, 'result_buffers': result_buffers,
'result_counts': result_counts, 'result_counts': result_counts,
'result_batch_counts': result_batch_counts,
}; };
const jobs = []; const jobs = [];
@ -209,11 +228,13 @@ async function do_lod(state, context) {
const result_offset = state.wasm.exports.merge_results( const result_offset = state.wasm.exports.merge_results(
result_counts, result_counts,
result_batch_counts,
result_buffers, result_buffers,
state.wasm.workers.length state.wasm.workers.length
); );
const segment_count = new Int32Array(state.wasm.memory.buffer, result_counts, 1)[0]; // by convention const segment_count = new Int32Array(state.wasm.memory.buffer, result_counts, 1)[0]; // by convention
const batch_count = new Int32Array(state.wasm.memory.buffer, result_batch_counts, 1)[0]; // by convention
// Use results without copying from WASM memory // Use results without copying from WASM memory
const wasm_points = new Float32Array(state.wasm.memory.buffer, const wasm_points = new Float32Array(state.wasm.memory.buffer,
@ -221,16 +242,25 @@ async function do_lod(state, context) {
const wasm_ids = new Uint32Array(state.wasm.memory.buffer, const wasm_ids = new Uint32Array(state.wasm.memory.buffer,
result_offset + segment_count * 2 * 4, segment_count); result_offset + segment_count * 2 * 4, segment_count);
const wasm_pressures = new Uint8Array(state.wasm.memory.buffer, const wasm_pressures = new Uint8Array(state.wasm.memory.buffer,
result_offset + segment_count * 2 * 4 + segment_count * 4, segment_count); result_offset + segment_count * 3 * 4, segment_count);
const wasm_batches = new Int32Array(state.wasm.memory.buffer,
result_offset + round_to_pow2(segment_count * (3 * 4 + 1), 4), batch_count * 2);
context.instance_data_points.data = wasm_points; context.instance_data_points.data = wasm_points;
context.instance_data_points.size = segment_count * 2; context.instance_data_points.size = segment_count * 2;
context.instance_data_points.capacity = segment_count * 2;
context.instance_data_ids.data = wasm_ids; context.instance_data_ids.data = wasm_ids;
context.instance_data_ids.size = segment_count; context.instance_data_ids.size = segment_count;
context.instance_data_ids.capacity = segment_count;
context.instance_data_pressures.data = wasm_pressures; context.instance_data_pressures.data = wasm_pressures;
context.instance_data_pressures.size = segment_count; context.instance_data_pressures.size = segment_count;
context.instance_data_pressures.capacity = segment_count;
context.instance_data_batches.data = wasm_batches;
context.instance_data_batches.size = batch_count * 2;
context.instance_data_batches.capacity = batch_count * 2;
return segment_count; return segment_count;
} }

61
client/wasm/lod.c

@ -197,11 +197,13 @@ rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coo
void void
do_lod(int *clipped_indices, int clipped_count, float zoom, do_lod(int *clipped_indices, int clipped_count, float zoom,
int *stroke_coords_from, int *stroke_coords_from,
int *width,
float *xs, float *xs,
float *ys, float *ys,
unsigned char *pressures, unsigned char *pressures,
char **result_buffer, char **result_buffer,
int *result_count) int *result_count,
int *result_batch_count)
{ {
if (clipped_count == 0) { if (clipped_count == 0) {
result_count[0] = 0; result_count[0] = 0;
@ -280,18 +282,22 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
// Write actual coordinates (points) and stroke ids // Write actual coordinates (points) and stroke ids
// Do this in one allocation so that they're not interleaved between threads // Do this in one allocation so that they're not interleaved between threads
char *output = alloc_dynamic(segments_head * (3 * 4 + 1)); char *output = alloc_dynamic(round_to_pow2(segments_head * (3 * 4 + 1), 4) + clipped_count * 4 * 2); // max two ints per stroke for batch info (realistically, much less)
float *points = (float *) output; float *points = (float *) output;
int *ids = (int *) (output + segments_head * 4 * 2); int *ids = (int *) (output + segments_head * 4 * 2);
unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3); unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3);
int *batches = (int *) (output + round_to_pow2(segments_head * (4 * 3 + 1), 4));
int phead = 0; int phead = 0;
int ihead = 0; int ihead = 0;
float sqrt_zoom = __builtin_sqrtf(zoom);
int last_lod = -100;
int batch_count = 0;
int batch_size = 0;
for (int i = 0; i < clipped_count; ++i) { for (int i = 0; i < clipped_count; ++i) {
int stroke_index = clipped_indices[i]; int stroke_index = clipped_indices[i];
// TODO: convert to a proper CSR, save half the memory
int base_stroke = stroke_coords_from[stroke_index]; int base_stroke = stroke_coords_from[stroke_index];
int from = segments_from[i]; int from = segments_from[i];
int to = segments_from[i + 1]; int to = segments_from[i + 1];
@ -312,35 +318,77 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
ids[ihead++] = stroke_index | (1 << 31); ids[ihead++] = stroke_index | (1 << 31);
} }
} }
int segment_count = to - from;
// Compute recommended LOD level, add to current batch or start new batch
float sqrt_width = __builtin_sqrtf(width[stroke_index]);
int lod = __builtin_ceil(sqrt_zoom * sqrt_width * 0.3333f); // TODO: round
if (lod > 7) lod = 7;
if (batch_size > 0 && __builtin_abs(lod - last_lod) > 2) {
// Start new batch
batches[batch_count * 2 + 0] = batch_size;
batches[batch_count * 2 + 1] = last_lod;
++batch_count;
batch_size = 0;
}
batch_size += segment_count;
last_lod = lod;
} }
if (batch_size > 0) {
batches[batch_count * 2 + 0] = batch_size;
batches[batch_count * 2 + 1] = last_lod;
++batch_count;
}
result_buffer[0] = output; result_buffer[0] = output;
result_count[0] = segments_head; result_count[0] = segments_head;
result_batch_count[0] = batch_count;
} }
// NOT thread-safe, only call from one thread // NOT thread-safe, only call from one thread
char * char *
merge_results(int *segment_counts, char **buffers, int nthreads) merge_results(int *segment_counts, int *batch_counts, char **buffers, int nthreads)
{ {
int total_segments = 0; int total_segments = 0;
int total_batches = 0;
for (int i = 0; i < nthreads; ++i) { for (int i = 0; i < nthreads; ++i) {
total_segments += segment_counts[i]; total_segments += segment_counts[i];
total_batches += batch_counts[i];
} }
char *merged = alloc_dynamic(total_segments * (3 * 4 + 1)); char *merged = alloc_dynamic(round_to_pow2(total_segments * (3 * 4 + 1), 4) + total_batches * 4);
float *points = (float *) merged; float *points = (float *) merged;
int *ids = (int *) (merged + total_segments * 4 * 2); int *ids = (int *) (merged + total_segments * 4 * 2);
unsigned char *pressures = (unsigned char *) (merged + total_segments * 4 * 3); unsigned char *pressures = (unsigned char *) (merged + total_segments * 4 * 3);
int *batches = (int *) (merged + round_to_pow2(total_segments * (3 * 4 + 1), 4));
int batch_base = 0;
int last_batch_lod = -99;
int bhead = 0;
int written_batches = 0;
for (int i = 0; i < nthreads; ++i) { for (int i = 0; i < nthreads; ++i) {
int segments = segment_counts[i]; int segments = segment_counts[i];
int nbatches = batch_counts[i];
int *thread_batches = (int *) (buffers[i] + round_to_pow2(segments * (4 * 3 + 1), 4));
if (segments > 0) { if (segments > 0) {
__builtin_memcpy(points, buffers[i], segments * 4 * 2); __builtin_memcpy(points, buffers[i], segments * 4 * 2);
__builtin_memcpy(ids, buffers[i] + segments * 4 * 2, segments * 4); __builtin_memcpy(ids, buffers[i] + segments * 4 * 2, segments * 4);
__builtin_memcpy(pressures, buffers[i] + segments * 4 * 3, segments); __builtin_memcpy(pressures, buffers[i] + segments * 4 * 3, segments);
for (int j = 0; j < nbatches * 2; j += 2) {
batches[bhead++] = written_batches;
batches[bhead++] = thread_batches[j + 1];
written_batches += thread_batches[j + 0];
}
points += segments * 2; points += segments * 2;
ids += segments; ids += segments;
pressures += segments; pressures += segments;
@ -348,6 +396,7 @@ merge_results(int *segment_counts, char **buffers, int nthreads)
} }
segment_counts[0] = total_segments; segment_counts[0] = total_segments;
batch_counts[0] = total_batches;
return(merged); return(merged);
} }

BIN
client/wasm/lod.wasm

Binary file not shown.

31
client/webgl_draw.js

@ -273,13 +273,11 @@ async function draw(state, context, animate, ts) {
} }
} }
// TODO: what do we do with this // TODO: @speed we can do this once at startup
const circle_lod = Math.round(Math.min(7, 3 * Math.sqrt(state.canvas.zoom)));
const lod_levels = []; const lod_levels = [];
let total_lod_floats = 0; let total_lod_floats = 0;
let total_lod_indices = 0; let total_lod_indices = 0;
let stat_total_vertices = 0; let stat_total_vertices = 0;
for (let i = 0; i <= 7; ++i) { for (let i = 0; i <= 7; ++i) {
const d = geometry_good_circle_and_dummy(i); const d = geometry_good_circle_and_dummy(i);
lod_levels.push({ lod_levels.push({
@ -295,20 +293,9 @@ async function draw(state, context, animate, ts) {
if (segment_count > 0) { if (segment_count > 0) {
const pr = programs['main']; const pr = programs['main'];
const nbatches = 10; // Last pair (lod unused) to have a proper from;to
const batches = []; tv_add2(context.instance_data_batches, segment_count);
tv_add2(context.instance_data_batches, -1);
for (let i = 0; i < nbatches; ++i) {
batches.push({
'index': Math.floor(segment_count / nbatches * i),
'lod': circle_lod,
});
if (i % 2 == 1) {
batches[batches.length - 1].lod = Math.max(0, batches[batches.length - 1].lod - 4);
}
}
batches.push({'index': segment_count, 'lod': -1}); // lod unused
gl.clear(gl.DEPTH_BUFFER_BIT); // draw strokes above the images gl.clear(gl.DEPTH_BUFFER_BIT); // draw strokes above the images
gl.useProgram(pr.program); gl.useProgram(pr.program);
@ -363,11 +350,10 @@ async function draw(state, context, animate, ts) {
gl.vertexAttribDivisor(pr.locations['a_stroke_id'], 1); gl.vertexAttribDivisor(pr.locations['a_stroke_id'], 1);
gl.vertexAttribDivisor(pr.locations['a_pressure'], 1); gl.vertexAttribDivisor(pr.locations['a_pressure'], 1);
for (let b = 0; b < batches.length - 1; ++b) { for (let b = 0; b < context.instance_data_batches.size - 2; b += 2) {
const batch = batches[b]; const batch_from = context.instance_data_batches.data[b + 0];
const batch_from = batches[b].index; const batch_size = context.instance_data_batches.data[b + 2] - batch_from;
const batch_size = batches[b + 1].index - batch_from; const level = lod_levels[context.instance_data_batches.data[b + 1]];
const level = lod_levels[batch.lod];
if (batch_size > 0) { if (batch_size > 0) {
stat_total_vertices += batch_size * level.data.indices.size; stat_total_vertices += batch_size * level.data.indices.size;
@ -593,7 +579,6 @@ async function draw(state, context, animate, ts) {
<span>Strokes onscreen: ${context.clipped_indices.size}</span> <span>Strokes onscreen: ${context.clipped_indices.size}</span>
<span>Segments onscreen: ${segment_count}</span> <span>Segments onscreen: ${segment_count}</span>
<span>Total vertices: ${stat_total_vertices}</span> <span>Total vertices: ${stat_total_vertices}</span>
<span>Circle LOD: ${circle_lod}</span>
<span>Canvas offset: (${Math.round(state.canvas.offset.x * 100) / 100}, ${Math.round(state.canvas.offset.y * 100) / 100})</span> <span>Canvas offset: (${Math.round(state.canvas.offset.x * 100) / 100}, ${Math.round(state.canvas.offset.y * 100) / 100})</span>
<span>Canvas zoom level: ${state.canvas.zoom_level}</span> <span>Canvas zoom level: ${state.canvas.zoom_level}</span>
<span>Canvas zoom: ${Math.round(state.canvas.zoom * 100) / 100}</span>`; <span>Canvas zoom: ${Math.round(state.canvas.zoom * 100) / 100}</span>`;

3
client/webgl_geometry.js

@ -58,6 +58,9 @@ function geometry_add_stroke(state, context, stroke, stroke_index, skip_bvh = fa
ser_u16(context.stroke_data, b); ser_u16(context.stroke_data, b);
ser_u16(context.stroke_data, stroke.width); ser_u16(context.stroke_data, stroke.width);
tv_add(state.wasm.buffers['width'].tv, stroke.width);
state.wasm.buffers['width'].used += 4;
if (!skip_bvh) bvh_add_stroke(state, state.bvh, stroke_index, stroke); if (!skip_bvh) bvh_add_stroke(state, state.bvh, stroke_index, stroke);
} }

Loading…
Cancel
Save