Compare commits

..

2 Commits

Author SHA1 Message Date
A.Olokhtonov 4a6715ef66 Batching is close to working. Not quite though 10 months ago
A.Olokhtonov ce824a8e31 Pass stroke widths to wasm (not actually writing any values right now, 10 months ago
  1. 1
      client/index.js
  2. 2
      client/lod_worker.js
  3. 36
      client/speed.js
  4. 59
      client/wasm/lod.c
  5. BIN
      client/wasm/lod.wasm
  6. 31
      client/webgl_draw.js
  7. 3
      client/webgl_geometry.js

1
client/index.js

@ -227,6 +227,7 @@ async function main() { @@ -227,6 +227,7 @@ async function main() {
'instance_data_points': tv_create(Float32Array, 4096),
'instance_data_ids': tv_create(Uint32Array, 4096),
'instance_data_pressures': tv_create(Uint8Array, 4096),
'instance_data_batches': tv_create(Uint32Array, 4096),
'dynamic_instance_points': tv_create(Float32Array, 4096),
'dynamic_instance_pressure': tv_create(Uint8Array, 4096),

2
client/lod_worker.js

@ -19,11 +19,13 @@ function work(indices_base, indices_count, zoom, offsets) { @@ -19,11 +19,13 @@ function work(indices_base, indices_count, zoom, offsets) {
exports.do_lod(
indices_base, indices_count, zoom,
offsets['coords_from'],
offsets['width'],
offsets['xs'],
offsets['ys'],
offsets['pressures'],
offsets['result_buffers'] + thread_id * 4,
offsets['result_counts'] + thread_id * 4,
offsets['result_batch_counts'] + thread_id * 4,
);
} catch (e) {
console.error('WASM:', e);

36
client/speed.js

@ -43,7 +43,7 @@ async function init_wasm(state) { @@ -43,7 +43,7 @@ async function init_wasm(state) {
env: { 'memory': memory }
});
const nworkers = navigator.hardwareConcurrency;
const nworkers = 1; //navigator.hardwareConcurrency;
state.wasm.exports = master_wasm.instance.exports;
state.wasm.heap_base = state.wasm.exports.alloc_static(0);
@ -80,12 +80,17 @@ async function init_wasm(state) { @@ -80,12 +80,17 @@ async function init_wasm(state) {
'used': 0,
'cap': initial
},
'width': {
'used': 0,
'cap': initial
}
};
state.wasm.buffers['xs'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['ys'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['pressures'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['coords_from'].offset = state.wasm.exports.alloc_static(initial);
state.wasm.buffers['width'].offset = state.wasm.exports.alloc_static(initial);
const mem = state.wasm.memory.buffer;
@ -97,6 +102,8 @@ async function init_wasm(state) { @@ -97,6 +102,8 @@ async function init_wasm(state) {
mem, state.wasm.buffers['pressures'].offset);
state.wasm.buffers['coords_from'].tv = tv_create_on(Uint32Array, initial / 4,
mem, state.wasm.buffers['coords_from'].offset);
state.wasm.buffers['width'].tv = tv_create_on(Uint32Array, initial / 4,
mem, state.wasm.buffers['width'].offset);
tv_add(state.wasm.buffers['coords_from'].tv, 0);
state.wasm.buffers['coords_from'].used = 4;
@ -108,6 +115,7 @@ function wasm_ensure_by(state, nstrokes, ncoords) { @@ -108,6 +115,7 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
const old_ys_offset = buffers['ys'].offset;
const old_coords_from_offset = buffers['coords_from'].offset;
const old_pressures_offset = buffers['pressures'].offset;
const old_width_offset = buffers['width'].offset;
let realloc = false;
let coords_bytes = buffers['xs'].cap;
@ -135,23 +143,31 @@ function wasm_ensure_by(state, nstrokes, ncoords) { @@ -135,23 +143,31 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
buffers['ys'].offset = state.wasm.exports.alloc_static(coords_bytes);
buffers['pressures'].offset = state.wasm.exports.alloc_static(coords_bytes);
buffers['coords_from'].offset = state.wasm.exports.alloc_static(stroke_bytes);
buffers['width'].offset = state.wasm.exports.alloc_static(stroke_bytes);
buffers['xs'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['xs'].offset);
buffers['ys'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['ys'].offset);
buffers['pressures'].tv = tv_create_on(Uint8Array, coords_bytes, mem, buffers['pressures'].offset);
buffers['coords_from'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['coords_from'].offset);
buffers['width'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['width'].offset);
// TODO: this should have been automatic maybe?
buffers['xs'].tv.size = buffers['xs'].used / 4;
buffers['ys'].tv.size = buffers['ys'].used / 4;
buffers['pressures'].tv.size = buffers['pressures'].used;
buffers['coords_from'].tv.size = buffers['coords_from'].used / 4;
buffers['width'].tv.size = buffers['width'].used / 4;
// TODO: this is SUS, should all the caps really be coords_bytes?
buffers['xs'].cap = buffers['ys'].cap = buffers['pressures'].cap = coords_bytes;
buffers['coords_from'].cap = stroke_bytes;
buffers['coords_from'].cap = buffers['width'].cap = stroke_bytes;
const tmp = new Uint8Array(Math.max(coords_bytes, stroke_bytes));
// Copy from back to front (otherwise we will overwrite)
tmp.set(new Uint8Array(mem, old_width_offset, buffers['width'].used));
memv.set(new Uint8Array(tmp.buffer, 0, buffers['width'].used), buffers['width'].offset);
tmp.set(new Uint8Array(mem, old_coords_from_offset, buffers['coords_from'].used));
memv.set(new Uint8Array(tmp.buffer, 0, buffers['coords_from'].used), buffers['coords_from'].offset);
@ -169,6 +185,7 @@ async function do_lod(state, context) { @@ -169,6 +185,7 @@ async function do_lod(state, context) {
const buffers = state.wasm.buffers;
const result_buffers = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
const result_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
const result_batch_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
const clipped_indices = state.wasm.exports.alloc_dynamic(context.clipped_indices.size * 4);
const mem = new Uint8Array(state.wasm.memory.buffer);
@ -180,11 +197,13 @@ async function do_lod(state, context) { @@ -180,11 +197,13 @@ async function do_lod(state, context) {
const indices_per_thread = Math.floor(context.clipped_indices.size / state.wasm.workers.length);
const offsets = {
'coords_from': buffers['coords_from'].offset,
'width': buffers['width'].offset,
'xs': buffers['xs'].offset,
'ys': buffers['ys'].offset,
'pressures': buffers['pressures'].offset,
'result_buffers': result_buffers,
'result_counts': result_counts,
'result_batch_counts': result_batch_counts,
};
const jobs = [];
@ -209,11 +228,13 @@ async function do_lod(state, context) { @@ -209,11 +228,13 @@ async function do_lod(state, context) {
const result_offset = state.wasm.exports.merge_results(
result_counts,
result_batch_counts,
result_buffers,
state.wasm.workers.length
);
const segment_count = new Int32Array(state.wasm.memory.buffer, result_counts, 1)[0]; // by convention
const batch_count = new Int32Array(state.wasm.memory.buffer, result_batch_counts, 1)[0]; // by convention
// Use results without copying from WASM memory
const wasm_points = new Float32Array(state.wasm.memory.buffer,
@ -221,16 +242,25 @@ async function do_lod(state, context) { @@ -221,16 +242,25 @@ async function do_lod(state, context) {
const wasm_ids = new Uint32Array(state.wasm.memory.buffer,
result_offset + segment_count * 2 * 4, segment_count);
const wasm_pressures = new Uint8Array(state.wasm.memory.buffer,
result_offset + segment_count * 2 * 4 + segment_count * 4, segment_count);
result_offset + segment_count * 3 * 4, segment_count);
const wasm_batches = new Int32Array(state.wasm.memory.buffer,
result_offset + round_to_pow2(segment_count * (3 * 4 + 1), 4), batch_count * 2);
context.instance_data_points.data = wasm_points;
context.instance_data_points.size = segment_count * 2;
context.instance_data_points.capacity = segment_count * 2;
context.instance_data_ids.data = wasm_ids;
context.instance_data_ids.size = segment_count;
context.instance_data_ids.capacity = segment_count;
context.instance_data_pressures.data = wasm_pressures;
context.instance_data_pressures.size = segment_count;
context.instance_data_pressures.capacity = segment_count;
context.instance_data_batches.data = wasm_batches;
context.instance_data_batches.size = batch_count * 2;
context.instance_data_batches.capacity = batch_count * 2;
return segment_count;
}

59
client/wasm/lod.c

@ -197,11 +197,13 @@ rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coo @@ -197,11 +197,13 @@ rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coo
void
do_lod(int *clipped_indices, int clipped_count, float zoom,
int *stroke_coords_from,
int *width,
float *xs,
float *ys,
unsigned char *pressures,
char **result_buffer,
int *result_count)
int *result_count,
int *result_batch_count)
{
if (clipped_count == 0) {
result_count[0] = 0;
@ -280,18 +282,22 @@ do_lod(int *clipped_indices, int clipped_count, float zoom, @@ -280,18 +282,22 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
// Write actual coordinates (points) and stroke ids
// Do this in one allocation so that they're not interleaved between threads
char *output = alloc_dynamic(segments_head * (3 * 4 + 1));
char *output = alloc_dynamic(round_to_pow2(segments_head * (3 * 4 + 1), 4) + clipped_count * 4 * 2); // max two ints per stroke for batch info (realistically, much less)
float *points = (float *) output;
int *ids = (int *) (output + segments_head * 4 * 2);
unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3);
int *batches = (int *) (output + round_to_pow2(segments_head * (4 * 3 + 1), 4));
int phead = 0;
int ihead = 0;
float sqrt_zoom = __builtin_sqrtf(zoom);
int last_lod = -100;
int batch_count = 0;
int batch_size = 0;
for (int i = 0; i < clipped_count; ++i) {
int stroke_index = clipped_indices[i];
// TODO: convert to a proper CSR, save half the memory
int base_stroke = stroke_coords_from[stroke_index];
int from = segments_from[i];
int to = segments_from[i + 1];
@ -312,35 +318,77 @@ do_lod(int *clipped_indices, int clipped_count, float zoom, @@ -312,35 +318,77 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
ids[ihead++] = stroke_index | (1 << 31);
}
}
int segment_count = to - from;
// Compute recommended LOD level, add to current batch or start new batch
float sqrt_width = __builtin_sqrtf(width[stroke_index]);
int lod = __builtin_ceil(sqrt_zoom * sqrt_width * 0.3333f); // TODO: round
if (lod > 7) lod = 7;
if (batch_size > 0 && __builtin_abs(lod - last_lod) > 2) {
// Start new batch
batches[batch_count * 2 + 0] = batch_size;
batches[batch_count * 2 + 1] = last_lod;
++batch_count;
batch_size = 0;
}
batch_size += segment_count;
last_lod = lod;
}
if (batch_size > 0) {
batches[batch_count * 2 + 0] = batch_size;
batches[batch_count * 2 + 1] = last_lod;
++batch_count;
}
result_buffer[0] = output;
result_count[0] = segments_head;
result_batch_count[0] = batch_count;
}
// NOT thread-safe, only call from one thread
char *
merge_results(int *segment_counts, char **buffers, int nthreads)
merge_results(int *segment_counts, int *batch_counts, char **buffers, int nthreads)
{
int total_segments = 0;
int total_batches = 0;
for (int i = 0; i < nthreads; ++i) {
total_segments += segment_counts[i];
total_batches += batch_counts[i];
}
char *merged = alloc_dynamic(total_segments * (3 * 4 + 1));
char *merged = alloc_dynamic(round_to_pow2(total_segments * (3 * 4 + 1), 4) + total_batches * 4);
float *points = (float *) merged;
int *ids = (int *) (merged + total_segments * 4 * 2);
unsigned char *pressures = (unsigned char *) (merged + total_segments * 4 * 3);
int *batches = (int *) (merged + round_to_pow2(total_segments * (3 * 4 + 1), 4));
int batch_base = 0;
int last_batch_lod = -99;
int bhead = 0;
int written_batches = 0;
for (int i = 0; i < nthreads; ++i) {
int segments = segment_counts[i];
int nbatches = batch_counts[i];
int *thread_batches = (int *) (buffers[i] + round_to_pow2(segments * (4 * 3 + 1), 4));
if (segments > 0) {
__builtin_memcpy(points, buffers[i], segments * 4 * 2);
__builtin_memcpy(ids, buffers[i] + segments * 4 * 2, segments * 4);
__builtin_memcpy(pressures, buffers[i] + segments * 4 * 3, segments);
for (int j = 0; j < nbatches * 2; j += 2) {
batches[bhead++] = written_batches;
batches[bhead++] = thread_batches[j + 1];
written_batches += thread_batches[j + 0];
}
points += segments * 2;
ids += segments;
pressures += segments;
@ -348,6 +396,7 @@ merge_results(int *segment_counts, char **buffers, int nthreads) @@ -348,6 +396,7 @@ merge_results(int *segment_counts, char **buffers, int nthreads)
}
segment_counts[0] = total_segments;
batch_counts[0] = total_batches;
return(merged);
}

BIN
client/wasm/lod.wasm

Binary file not shown.

31
client/webgl_draw.js

@ -273,13 +273,11 @@ async function draw(state, context, animate, ts) { @@ -273,13 +273,11 @@ async function draw(state, context, animate, ts) {
}
}
// TODO: what do we do with this
const circle_lod = Math.round(Math.min(7, 3 * Math.sqrt(state.canvas.zoom)));
// TODO: @speed we can do this once at startup
const lod_levels = [];
let total_lod_floats = 0;
let total_lod_indices = 0;
let stat_total_vertices = 0;
for (let i = 0; i <= 7; ++i) {
const d = geometry_good_circle_and_dummy(i);
lod_levels.push({
@ -295,20 +293,9 @@ async function draw(state, context, animate, ts) { @@ -295,20 +293,9 @@ async function draw(state, context, animate, ts) {
if (segment_count > 0) {
const pr = programs['main'];
const nbatches = 10;
const batches = [];
for (let i = 0; i < nbatches; ++i) {
batches.push({
'index': Math.floor(segment_count / nbatches * i),
'lod': circle_lod,
});
if (i % 2 == 1) {
batches[batches.length - 1].lod = Math.max(0, batches[batches.length - 1].lod - 4);
}
}
batches.push({'index': segment_count, 'lod': -1}); // lod unused
// Last pair (lod unused) to have a proper from;to
tv_add2(context.instance_data_batches, segment_count);
tv_add2(context.instance_data_batches, -1);
gl.clear(gl.DEPTH_BUFFER_BIT); // draw strokes above the images
gl.useProgram(pr.program);
@ -363,11 +350,10 @@ async function draw(state, context, animate, ts) { @@ -363,11 +350,10 @@ async function draw(state, context, animate, ts) {
gl.vertexAttribDivisor(pr.locations['a_stroke_id'], 1);
gl.vertexAttribDivisor(pr.locations['a_pressure'], 1);
for (let b = 0; b < batches.length - 1; ++b) {
const batch = batches[b];
const batch_from = batches[b].index;
const batch_size = batches[b + 1].index - batch_from;
const level = lod_levels[batch.lod];
for (let b = 0; b < context.instance_data_batches.size - 2; b += 2) {
const batch_from = context.instance_data_batches.data[b + 0];
const batch_size = context.instance_data_batches.data[b + 2] - batch_from;
const level = lod_levels[context.instance_data_batches.data[b + 1]];
if (batch_size > 0) {
stat_total_vertices += batch_size * level.data.indices.size;
@ -593,7 +579,6 @@ async function draw(state, context, animate, ts) { @@ -593,7 +579,6 @@ async function draw(state, context, animate, ts) {
<span>Strokes onscreen: ${context.clipped_indices.size}</span>
<span>Segments onscreen: ${segment_count}</span>
<span>Total vertices: ${stat_total_vertices}</span>
<span>Circle LOD: ${circle_lod}</span>
<span>Canvas offset: (${Math.round(state.canvas.offset.x * 100) / 100}, ${Math.round(state.canvas.offset.y * 100) / 100})</span>
<span>Canvas zoom level: ${state.canvas.zoom_level}</span>
<span>Canvas zoom: ${Math.round(state.canvas.zoom * 100) / 100}</span>`;

3
client/webgl_geometry.js

@ -58,6 +58,9 @@ function geometry_add_stroke(state, context, stroke, stroke_index, skip_bvh = fa @@ -58,6 +58,9 @@ function geometry_add_stroke(state, context, stroke, stroke_index, skip_bvh = fa
ser_u16(context.stroke_data, b);
ser_u16(context.stroke_data, stroke.width);
tv_add(state.wasm.buffers['width'].tv, stroke.width);
state.wasm.buffers['width'].used += 4;
if (!skip_bvh) bvh_add_stroke(state, state.bvh, stroke_index, stroke);
}

Loading…
Cancel
Save