Compare commits

...

2 Commits

Author SHA1 Message Date
A.Olokhtonov 4a6715ef66 Batching is close to working. Not quite though 2 weeks ago
A.Olokhtonov ce824a8e31 Pass stroke widths to wasm (not actually writing any values right now, 2 weeks ago
  1. 1
      client/index.js
  2. 2
      client/lod_worker.js
  3. 36
      client/speed.js
  4. 61
      client/wasm/lod.c
  5. BIN
      client/wasm/lod.wasm
  6. 31
      client/webgl_draw.js
  7. 3
      client/webgl_geometry.js

1
client/index.js

@ -227,6 +227,7 @@ async function main() {
'instance_data_points': tv_create(Float32Array, 4096), 'instance_data_points': tv_create(Float32Array, 4096),
'instance_data_ids': tv_create(Uint32Array, 4096), 'instance_data_ids': tv_create(Uint32Array, 4096),
'instance_data_pressures': tv_create(Uint8Array, 4096), 'instance_data_pressures': tv_create(Uint8Array, 4096),
'instance_data_batches': tv_create(Uint32Array, 4096),
'dynamic_instance_points': tv_create(Float32Array, 4096), 'dynamic_instance_points': tv_create(Float32Array, 4096),
'dynamic_instance_pressure': tv_create(Uint8Array, 4096), 'dynamic_instance_pressure': tv_create(Uint8Array, 4096),

2
client/lod_worker.js

@ -19,11 +19,13 @@ function work(indices_base, indices_count, zoom, offsets) {
exports.do_lod( exports.do_lod(
indices_base, indices_count, zoom, indices_base, indices_count, zoom,
offsets['coords_from'], offsets['coords_from'],
offsets['width'],
offsets['xs'], offsets['xs'],
offsets['ys'], offsets['ys'],
offsets['pressures'], offsets['pressures'],
offsets['result_buffers'] + thread_id * 4, offsets['result_buffers'] + thread_id * 4,
offsets['result_counts'] + thread_id * 4, offsets['result_counts'] + thread_id * 4,
offsets['result_batch_counts'] + thread_id * 4,
); );
} catch (e) { } catch (e) {
console.error('WASM:', e); console.error('WASM:', e);

36
client/speed.js

@ -43,7 +43,7 @@ async function init_wasm(state) {
env: { 'memory': memory } env: { 'memory': memory }
}); });
const nworkers = navigator.hardwareConcurrency; const nworkers = 1; //navigator.hardwareConcurrency;
state.wasm.exports = master_wasm.instance.exports; state.wasm.exports = master_wasm.instance.exports;
state.wasm.heap_base = state.wasm.exports.alloc_static(0); state.wasm.heap_base = state.wasm.exports.alloc_static(0);
@ -80,12 +80,17 @@ async function init_wasm(state) {
'used': 0, 'used': 0,
'cap': initial 'cap': initial
}, },
'width': {
'used': 0,
'cap': initial
}
}; };
state.wasm.buffers['xs'].offset = state.wasm.exports.alloc_static(initial); state.wasm.buffers['xs'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['ys'].offset = state.wasm.exports.alloc_static(initial); state.wasm.buffers['ys'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['pressures'].offset = state.wasm.exports.alloc_static(initial); state.wasm.buffers['pressures'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['coords_from'].offset = state.wasm.exports.alloc_static(initial); state.wasm.buffers['coords_from'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['width'].offset = state.wasm.exports.alloc_static(initial);
const mem = state.wasm.memory.buffer; const mem = state.wasm.memory.buffer;
@ -97,6 +102,8 @@ async function init_wasm(state) {
mem, state.wasm.buffers['pressures'].offset); mem, state.wasm.buffers['pressures'].offset);
state.wasm.buffers['coords_from'].tv = tv_create_on(Uint32Array, initial / 4, state.wasm.buffers['coords_from'].tv = tv_create_on(Uint32Array, initial / 4,
mem, state.wasm.buffers['coords_from'].offset); mem, state.wasm.buffers['coords_from'].offset);
state.wasm.buffers['width'].tv = tv_create_on(Uint32Array, initial / 4,
mem, state.wasm.buffers['width'].offset);
tv_add(state.wasm.buffers['coords_from'].tv, 0); tv_add(state.wasm.buffers['coords_from'].tv, 0);
state.wasm.buffers['coords_from'].used = 4; state.wasm.buffers['coords_from'].used = 4;
@ -108,6 +115,7 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
const old_ys_offset = buffers['ys'].offset; const old_ys_offset = buffers['ys'].offset;
const old_coords_from_offset = buffers['coords_from'].offset; const old_coords_from_offset = buffers['coords_from'].offset;
const old_pressures_offset = buffers['pressures'].offset; const old_pressures_offset = buffers['pressures'].offset;
const old_width_offset = buffers['width'].offset;
let realloc = false; let realloc = false;
let coords_bytes = buffers['xs'].cap; let coords_bytes = buffers['xs'].cap;
@ -135,23 +143,31 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
buffers['ys'].offset = state.wasm.exports.alloc_static(coords_bytes); buffers['ys'].offset = state.wasm.exports.alloc_static(coords_bytes);
buffers['pressures'].offset = state.wasm.exports.alloc_static(coords_bytes); buffers['pressures'].offset = state.wasm.exports.alloc_static(coords_bytes);
buffers['coords_from'].offset = state.wasm.exports.alloc_static(stroke_bytes); buffers['coords_from'].offset = state.wasm.exports.alloc_static(stroke_bytes);
buffers['width'].offset = state.wasm.exports.alloc_static(stroke_bytes);
buffers['xs'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['xs'].offset); buffers['xs'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['xs'].offset);
buffers['ys'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['ys'].offset); buffers['ys'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['ys'].offset);
buffers['pressures'].tv = tv_create_on(Uint8Array, coords_bytes, mem, buffers['pressures'].offset); buffers['pressures'].tv = tv_create_on(Uint8Array, coords_bytes, mem, buffers['pressures'].offset);
buffers['coords_from'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['coords_from'].offset); buffers['coords_from'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['coords_from'].offset);
buffers['width'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['width'].offset);
// TODO: this should have been automatic maybe? // TODO: this should have been automatic maybe?
buffers['xs'].tv.size = buffers['xs'].used / 4; buffers['xs'].tv.size = buffers['xs'].used / 4;
buffers['ys'].tv.size = buffers['ys'].used / 4; buffers['ys'].tv.size = buffers['ys'].used / 4;
buffers['pressures'].tv.size = buffers['pressures'].used; buffers['pressures'].tv.size = buffers['pressures'].used;
buffers['coords_from'].tv.size = buffers['coords_from'].used / 4; buffers['coords_from'].tv.size = buffers['coords_from'].used / 4;
buffers['width'].tv.size = buffers['width'].used / 4;
// TODO: this is SUS, should all the caps really be coords_bytes?
buffers['xs'].cap = buffers['ys'].cap = buffers['pressures'].cap = coords_bytes; buffers['xs'].cap = buffers['ys'].cap = buffers['pressures'].cap = coords_bytes;
buffers['coords_from'].cap = stroke_bytes; buffers['coords_from'].cap = buffers['width'].cap = stroke_bytes;
const tmp = new Uint8Array(Math.max(coords_bytes, stroke_bytes)); const tmp = new Uint8Array(Math.max(coords_bytes, stroke_bytes));
// Copy from back to front (otherwise we will overwrite) // Copy from back to front (otherwise we will overwrite)
tmp.set(new Uint8Array(mem, old_width_offset, buffers['width'].used));
memv.set(new Uint8Array(tmp.buffer, 0, buffers['width'].used), buffers['width'].offset);
tmp.set(new Uint8Array(mem, old_coords_from_offset, buffers['coords_from'].used)); tmp.set(new Uint8Array(mem, old_coords_from_offset, buffers['coords_from'].used));
memv.set(new Uint8Array(tmp.buffer, 0, buffers['coords_from'].used), buffers['coords_from'].offset); memv.set(new Uint8Array(tmp.buffer, 0, buffers['coords_from'].used), buffers['coords_from'].offset);
@ -169,6 +185,7 @@ async function do_lod(state, context) {
const buffers = state.wasm.buffers; const buffers = state.wasm.buffers;
const result_buffers = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4); const result_buffers = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
const result_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4); const result_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
const result_batch_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
const clipped_indices = state.wasm.exports.alloc_dynamic(context.clipped_indices.size * 4); const clipped_indices = state.wasm.exports.alloc_dynamic(context.clipped_indices.size * 4);
const mem = new Uint8Array(state.wasm.memory.buffer); const mem = new Uint8Array(state.wasm.memory.buffer);
@ -180,11 +197,13 @@ async function do_lod(state, context) {
const indices_per_thread = Math.floor(context.clipped_indices.size / state.wasm.workers.length); const indices_per_thread = Math.floor(context.clipped_indices.size / state.wasm.workers.length);
const offsets = { const offsets = {
'coords_from': buffers['coords_from'].offset, 'coords_from': buffers['coords_from'].offset,
'width': buffers['width'].offset,
'xs': buffers['xs'].offset, 'xs': buffers['xs'].offset,
'ys': buffers['ys'].offset, 'ys': buffers['ys'].offset,
'pressures': buffers['pressures'].offset, 'pressures': buffers['pressures'].offset,
'result_buffers': result_buffers, 'result_buffers': result_buffers,
'result_counts': result_counts, 'result_counts': result_counts,
'result_batch_counts': result_batch_counts,
}; };
const jobs = []; const jobs = [];
@ -209,11 +228,13 @@ async function do_lod(state, context) {
const result_offset = state.wasm.exports.merge_results( const result_offset = state.wasm.exports.merge_results(
result_counts, result_counts,
result_batch_counts,
result_buffers, result_buffers,
state.wasm.workers.length state.wasm.workers.length
); );
const segment_count = new Int32Array(state.wasm.memory.buffer, result_counts, 1)[0]; // by convention const segment_count = new Int32Array(state.wasm.memory.buffer, result_counts, 1)[0]; // by convention
const batch_count = new Int32Array(state.wasm.memory.buffer, result_batch_counts, 1)[0]; // by convention
// Use results without copying from WASM memory // Use results without copying from WASM memory
const wasm_points = new Float32Array(state.wasm.memory.buffer, const wasm_points = new Float32Array(state.wasm.memory.buffer,
@ -221,16 +242,25 @@ async function do_lod(state, context) {
const wasm_ids = new Uint32Array(state.wasm.memory.buffer, const wasm_ids = new Uint32Array(state.wasm.memory.buffer,
result_offset + segment_count * 2 * 4, segment_count); result_offset + segment_count * 2 * 4, segment_count);
const wasm_pressures = new Uint8Array(state.wasm.memory.buffer, const wasm_pressures = new Uint8Array(state.wasm.memory.buffer,
result_offset + segment_count * 2 * 4 + segment_count * 4, segment_count); result_offset + segment_count * 3 * 4, segment_count);
const wasm_batches = new Int32Array(state.wasm.memory.buffer,
result_offset + round_to_pow2(segment_count * (3 * 4 + 1), 4), batch_count * 2);
context.instance_data_points.data = wasm_points; context.instance_data_points.data = wasm_points;
context.instance_data_points.size = segment_count * 2; context.instance_data_points.size = segment_count * 2;
context.instance_data_points.capacity = segment_count * 2;
context.instance_data_ids.data = wasm_ids; context.instance_data_ids.data = wasm_ids;
context.instance_data_ids.size = segment_count; context.instance_data_ids.size = segment_count;
context.instance_data_ids.capacity = segment_count;
context.instance_data_pressures.data = wasm_pressures; context.instance_data_pressures.data = wasm_pressures;
context.instance_data_pressures.size = segment_count; context.instance_data_pressures.size = segment_count;
context.instance_data_pressures.capacity = segment_count;
context.instance_data_batches.data = wasm_batches;
context.instance_data_batches.size = batch_count * 2;
context.instance_data_batches.capacity = batch_count * 2;
return segment_count; return segment_count;
} }

61
client/wasm/lod.c

@ -197,11 +197,13 @@ rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coo
void void
do_lod(int *clipped_indices, int clipped_count, float zoom, do_lod(int *clipped_indices, int clipped_count, float zoom,
int *stroke_coords_from, int *stroke_coords_from,
int *width,
float *xs, float *xs,
float *ys, float *ys,
unsigned char *pressures, unsigned char *pressures,
char **result_buffer, char **result_buffer,
int *result_count) int *result_count,
int *result_batch_count)
{ {
if (clipped_count == 0) { if (clipped_count == 0) {
result_count[0] = 0; result_count[0] = 0;
@ -280,18 +282,22 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
// Write actual coordinates (points) and stroke ids // Write actual coordinates (points) and stroke ids
// Do this in one allocation so that they're not interleaved between threads // Do this in one allocation so that they're not interleaved between threads
char *output = alloc_dynamic(segments_head * (3 * 4 + 1)); char *output = alloc_dynamic(round_to_pow2(segments_head * (3 * 4 + 1), 4) + clipped_count * 4 * 2); // max two ints per stroke for batch info (realistically, much less)
float *points = (float *) output; float *points = (float *) output;
int *ids = (int *) (output + segments_head * 4 * 2); int *ids = (int *) (output + segments_head * 4 * 2);
unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3); unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3);
int *batches = (int *) (output + round_to_pow2(segments_head * (4 * 3 + 1), 4));
int phead = 0; int phead = 0;
int ihead = 0; int ihead = 0;
float sqrt_zoom = __builtin_sqrtf(zoom);
int last_lod = -100;
int batch_count = 0;
int batch_size = 0;
for (int i = 0; i < clipped_count; ++i) { for (int i = 0; i < clipped_count; ++i) {
int stroke_index = clipped_indices[i]; int stroke_index = clipped_indices[i];
// TODO: convert to a proper CSR, save half the memory
int base_stroke = stroke_coords_from[stroke_index]; int base_stroke = stroke_coords_from[stroke_index];
int from = segments_from[i]; int from = segments_from[i];
int to = segments_from[i + 1]; int to = segments_from[i + 1];
@ -312,35 +318,77 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
ids[ihead++] = stroke_index | (1 << 31); ids[ihead++] = stroke_index | (1 << 31);
} }
} }
int segment_count = to - from;
// Compute recommended LOD level, add to current batch or start new batch
float sqrt_width = __builtin_sqrtf(width[stroke_index]);
int lod = __builtin_ceil(sqrt_zoom * sqrt_width * 0.3333f); // TODO: round
if (lod > 7) lod = 7;
if (batch_size > 0 && __builtin_abs(lod - last_lod) > 2) {
// Start new batch
batches[batch_count * 2 + 0] = batch_size;
batches[batch_count * 2 + 1] = last_lod;
++batch_count;
batch_size = 0;
}
batch_size += segment_count;
last_lod = lod;
} }
if (batch_size > 0) {
batches[batch_count * 2 + 0] = batch_size;
batches[batch_count * 2 + 1] = last_lod;
++batch_count;
}
result_buffer[0] = output; result_buffer[0] = output;
result_count[0] = segments_head; result_count[0] = segments_head;
result_batch_count[0] = batch_count;
} }
// NOT thread-safe, only call from one thread // NOT thread-safe, only call from one thread
char * char *
merge_results(int *segment_counts, char **buffers, int nthreads) merge_results(int *segment_counts, int *batch_counts, char **buffers, int nthreads)
{ {
int total_segments = 0; int total_segments = 0;
int total_batches = 0;
for (int i = 0; i < nthreads; ++i) { for (int i = 0; i < nthreads; ++i) {
total_segments += segment_counts[i]; total_segments += segment_counts[i];
total_batches += batch_counts[i];
} }
char *merged = alloc_dynamic(total_segments * (3 * 4 + 1)); char *merged = alloc_dynamic(round_to_pow2(total_segments * (3 * 4 + 1), 4) + total_batches * 4);
float *points = (float *) merged; float *points = (float *) merged;
int *ids = (int *) (merged + total_segments * 4 * 2); int *ids = (int *) (merged + total_segments * 4 * 2);
unsigned char *pressures = (unsigned char *) (merged + total_segments * 4 * 3); unsigned char *pressures = (unsigned char *) (merged + total_segments * 4 * 3);
int *batches = (int *) (merged + round_to_pow2(total_segments * (3 * 4 + 1), 4));
int batch_base = 0;
int last_batch_lod = -99;
int bhead = 0;
int written_batches = 0;
for (int i = 0; i < nthreads; ++i) { for (int i = 0; i < nthreads; ++i) {
int segments = segment_counts[i]; int segments = segment_counts[i];
int nbatches = batch_counts[i];
int *thread_batches = (int *) (buffers[i] + round_to_pow2(segments * (4 * 3 + 1), 4));
if (segments > 0) { if (segments > 0) {
__builtin_memcpy(points, buffers[i], segments * 4 * 2); __builtin_memcpy(points, buffers[i], segments * 4 * 2);
__builtin_memcpy(ids, buffers[i] + segments * 4 * 2, segments * 4); __builtin_memcpy(ids, buffers[i] + segments * 4 * 2, segments * 4);
__builtin_memcpy(pressures, buffers[i] + segments * 4 * 3, segments); __builtin_memcpy(pressures, buffers[i] + segments * 4 * 3, segments);
for (int j = 0; j < nbatches * 2; j += 2) {
batches[bhead++] = written_batches;
batches[bhead++] = thread_batches[j + 1];
written_batches += thread_batches[j + 0];
}
points += segments * 2; points += segments * 2;
ids += segments; ids += segments;
pressures += segments; pressures += segments;
@ -348,6 +396,7 @@ merge_results(int *segment_counts, char **buffers, int nthreads)
} }
segment_counts[0] = total_segments; segment_counts[0] = total_segments;
batch_counts[0] = total_batches;
return(merged); return(merged);
} }

BIN
client/wasm/lod.wasm

Binary file not shown.

31
client/webgl_draw.js

@ -273,13 +273,11 @@ async function draw(state, context, animate, ts) {
} }
} }
// TODO: what do we do with this // TODO: @speed we can do this once at startup
const circle_lod = Math.round(Math.min(7, 3 * Math.sqrt(state.canvas.zoom)));
const lod_levels = []; const lod_levels = [];
let total_lod_floats = 0; let total_lod_floats = 0;
let total_lod_indices = 0; let total_lod_indices = 0;
let stat_total_vertices = 0; let stat_total_vertices = 0;
for (let i = 0; i <= 7; ++i) { for (let i = 0; i <= 7; ++i) {
const d = geometry_good_circle_and_dummy(i); const d = geometry_good_circle_and_dummy(i);
lod_levels.push({ lod_levels.push({
@ -295,20 +293,9 @@ async function draw(state, context, animate, ts) {
if (segment_count > 0) { if (segment_count > 0) {
const pr = programs['main']; const pr = programs['main'];
const nbatches = 10; // Last pair (lod unused) to have a proper from;to
const batches = []; tv_add2(context.instance_data_batches, segment_count);
tv_add2(context.instance_data_batches, -1);
for (let i = 0; i < nbatches; ++i) {
batches.push({
'index': Math.floor(segment_count / nbatches * i),
'lod': circle_lod,
});
if (i % 2 == 1) {
batches[batches.length - 1].lod = Math.max(0, batches[batches.length - 1].lod - 4);
}
}
batches.push({'index': segment_count, 'lod': -1}); // lod unused
gl.clear(gl.DEPTH_BUFFER_BIT); // draw strokes above the images gl.clear(gl.DEPTH_BUFFER_BIT); // draw strokes above the images
gl.useProgram(pr.program); gl.useProgram(pr.program);
@ -363,11 +350,10 @@ async function draw(state, context, animate, ts) {
gl.vertexAttribDivisor(pr.locations['a_stroke_id'], 1); gl.vertexAttribDivisor(pr.locations['a_stroke_id'], 1);
gl.vertexAttribDivisor(pr.locations['a_pressure'], 1); gl.vertexAttribDivisor(pr.locations['a_pressure'], 1);
for (let b = 0; b < batches.length - 1; ++b) { for (let b = 0; b < context.instance_data_batches.size - 2; b += 2) {
const batch = batches[b]; const batch_from = context.instance_data_batches.data[b + 0];
const batch_from = batches[b].index; const batch_size = context.instance_data_batches.data[b + 2] - batch_from;
const batch_size = batches[b + 1].index - batch_from; const level = lod_levels[context.instance_data_batches.data[b + 1]];
const level = lod_levels[batch.lod];
if (batch_size > 0) { if (batch_size > 0) {
stat_total_vertices += batch_size * level.data.indices.size; stat_total_vertices += batch_size * level.data.indices.size;
@ -593,7 +579,6 @@ async function draw(state, context, animate, ts) {
<span>Strokes onscreen: ${context.clipped_indices.size}</span> <span>Strokes onscreen: ${context.clipped_indices.size}</span>
<span>Segments onscreen: ${segment_count}</span> <span>Segments onscreen: ${segment_count}</span>
<span>Total vertices: ${stat_total_vertices}</span> <span>Total vertices: ${stat_total_vertices}</span>
<span>Circle LOD: ${circle_lod}</span>
<span>Canvas offset: (${Math.round(state.canvas.offset.x * 100) / 100}, ${Math.round(state.canvas.offset.y * 100) / 100})</span> <span>Canvas offset: (${Math.round(state.canvas.offset.x * 100) / 100}, ${Math.round(state.canvas.offset.y * 100) / 100})</span>
<span>Canvas zoom level: ${state.canvas.zoom_level}</span> <span>Canvas zoom level: ${state.canvas.zoom_level}</span>
<span>Canvas zoom: ${Math.round(state.canvas.zoom * 100) / 100}</span>`; <span>Canvas zoom: ${Math.round(state.canvas.zoom * 100) / 100}</span>`;

3
client/webgl_geometry.js

@ -58,6 +58,9 @@ function geometry_add_stroke(state, context, stroke, stroke_index, skip_bvh = fa
ser_u16(context.stroke_data, b); ser_u16(context.stroke_data, b);
ser_u16(context.stroke_data, stroke.width); ser_u16(context.stroke_data, stroke.width);
tv_add(state.wasm.buffers['width'].tv, stroke.width);
state.wasm.buffers['width'].used += 4;
if (!skip_bvh) bvh_add_stroke(state, state.bvh, stroke_index, stroke); if (!skip_bvh) bvh_add_stroke(state, state.bvh, stroke_index, stroke);
} }

Loading…
Cancel
Save