Batching is close to working, but not quite there yet.

Pass stroke widths to wasm (no values are actually written yet; this is
just the plumbing to allocate memory and pass pointers)
7 changed files with 102 additions and 32 deletions
--- a/client/index.js
+++ b/client/index.js
@@ -227,6 +227,7 @@ async function main() {
        'instance_data_points': tv_create(Float32Array, 4096),
        'instance_data_ids': tv_create(Uint32Array, 4096),
        'instance_data_pressures': tv_create(Uint8Array, 4096),
+        'instance_data_batches': tv_create(Uint32Array, 4096),
       
        'dynamic_instance_points': tv_create(Float32Array, 4096),
        'dynamic_instance_pressure': tv_create(Uint8Array, 4096),
--- a/client/lod_worker.js
+++ b/client/lod_worker.js
@@ -19,11 +19,13 @@ function work(indices_base, indices_count, zoom, offsets) {
        exports.do_lod(
            indices_base, indices_count, zoom,
            offsets['coords_from'],
+            offsets['width'],
            offsets['xs'],
            offsets['ys'],
            offsets['pressures'],
            offsets['result_buffers'] + thread_id * 4,
            offsets['result_counts'] + thread_id * 4,
+            offsets['result_batch_counts'] + thread_id * 4,
        );
    } catch (e) {
        console.error('WASM:', e);
--- a/client/speed.js
+++ b/client/speed.js
@@ -43,7 +43,7 @@ async function init_wasm(state) {
        env: { 'memory': memory }
    });

-    const nworkers = navigator.hardwareConcurrency;
+    const nworkers = 1; //navigator.hardwareConcurrency;

    state.wasm.exports = master_wasm.instance.exports;
    state.wasm.heap_base = state.wasm.exports.alloc_static(0);
@@ -80,12 +80,17 @@ async function init_wasm(state) {
            'used': 0,
            'cap': initial
        },
+        'width': {
+            'used': 0,
+            'cap': initial
+        }
    };

    state.wasm.buffers['xs'].offset = state.wasm.exports.alloc_static(initial);
    state.wasm.buffers['ys'].offset = state.wasm.exports.alloc_static(initial);
    state.wasm.buffers['pressures'].offset = state.wasm.exports.alloc_static(initial);
    state.wasm.buffers['coords_from'].offset = state.wasm.exports.alloc_static(initial);
+    state.wasm.buffers['width'].offset = state.wasm.exports.alloc_static(initial);

    const mem = state.wasm.memory.buffer;

@@ -97,6 +102,8 @@ async function init_wasm(state) {
        mem, state.wasm.buffers['pressures'].offset);
    state.wasm.buffers['coords_from'].tv = tv_create_on(Uint32Array, initial / 4, 
        mem, state.wasm.buffers['coords_from'].offset);
+    state.wasm.buffers['width'].tv = tv_create_on(Uint32Array, initial / 4, 
+        mem, state.wasm.buffers['width'].offset);

    tv_add(state.wasm.buffers['coords_from'].tv, 0);
    state.wasm.buffers['coords_from'].used = 4;
@@ -108,6 +115,7 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
    const old_ys_offset = buffers['ys'].offset;
    const old_coords_from_offset = buffers['coords_from'].offset;
    const old_pressures_offset = buffers['pressures'].offset;
+    const old_width_offset = buffers['width'].offset;

    let realloc = false;
    let coords_bytes = buffers['xs'].cap;
@@ -135,23 +143,31 @@ function wasm_ensure_by(state, nstrokes, ncoords) {
        buffers['ys'].offset = state.wasm.exports.alloc_static(coords_bytes);
        buffers['pressures'].offset = state.wasm.exports.alloc_static(coords_bytes);
        buffers['coords_from'].offset = state.wasm.exports.alloc_static(stroke_bytes);
+        buffers['width'].offset = state.wasm.exports.alloc_static(stroke_bytes);
  
        buffers['xs'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['xs'].offset);
        buffers['ys'].tv = tv_create_on(Float32Array, coords_bytes / 4, mem, buffers['ys'].offset);
        buffers['pressures'].tv = tv_create_on(Uint8Array, coords_bytes, mem, buffers['pressures'].offset);
        buffers['coords_from'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['coords_from'].offset);
+        buffers['width'].tv = tv_create_on(Uint32Array, stroke_bytes / 4, mem, buffers['width'].offset);

        // TODO: this should have been automatic maybe?
        buffers['xs'].tv.size = buffers['xs'].used / 4;
        buffers['ys'].tv.size = buffers['ys'].used / 4;
        buffers['pressures'].tv.size = buffers['pressures'].used;
        buffers['coords_from'].tv.size = buffers['coords_from'].used / 4;
+        buffers['width'].tv.size = buffers['width'].used / 4;
+
+        // TODO: this is SUS, should all the caps really be coords_bytes?
        buffers['xs'].cap = buffers['ys'].cap = buffers['pressures'].cap = coords_bytes;
-        buffers['coords_from'].cap = stroke_bytes;
+        buffers['coords_from'].cap = buffers['width'].cap = stroke_bytes;

        const tmp = new Uint8Array(Math.max(coords_bytes, stroke_bytes));

        // Copy from back to front (otherwise we will overwrite)
+        tmp.set(new Uint8Array(mem, old_width_offset, buffers['width'].used));
+        memv.set(new Uint8Array(tmp.buffer, 0, buffers['width'].used), buffers['width'].offset);
+
        tmp.set(new Uint8Array(mem, old_coords_from_offset, buffers['coords_from'].used));
        memv.set(new Uint8Array(tmp.buffer, 0, buffers['coords_from'].used), buffers['coords_from'].offset);

@@ -169,6 +185,7 @@ async function do_lod(state, context) {
    const buffers = state.wasm.buffers;
    const result_buffers = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
    const result_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
+    const result_batch_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
    const clipped_indices = state.wasm.exports.alloc_dynamic(context.clipped_indices.size * 4); 
    const mem = new Uint8Array(state.wasm.memory.buffer);
    
@@ -180,11 +197,13 @@ async function do_lod(state, context) {
    const indices_per_thread = Math.floor(context.clipped_indices.size / state.wasm.workers.length);
    const offsets = {
        'coords_from': buffers['coords_from'].offset,
+        'width': buffers['width'].offset,
        'xs': buffers['xs'].offset,
        'ys': buffers['ys'].offset,
        'pressures': buffers['pressures'].offset,
        'result_buffers': result_buffers,
        'result_counts': result_counts,
+        'result_batch_counts': result_batch_counts,
    };

    const jobs = [];
@@ -209,11 +228,13 @@ async function do_lod(state, context) {

    const result_offset = state.wasm.exports.merge_results(
        result_counts,
+        result_batch_counts,
        result_buffers,
        state.wasm.workers.length
    );

    const segment_count = new Int32Array(state.wasm.memory.buffer, result_counts, 1)[0]; // by convention
+    const batch_count = new Int32Array(state.wasm.memory.buffer, result_batch_counts, 1)[0]; // by convention

    // Use results without copying from WASM memory 
    const wasm_points = new Float32Array(state.wasm.memory.buffer, 
@@ -221,16 +242,25 @@ async function do_lod(state, context) {
    const wasm_ids = new Uint32Array(state.wasm.memory.buffer, 
        result_offset + segment_count * 2 * 4, segment_count);
    const wasm_pressures = new Uint8Array(state.wasm.memory.buffer,
-        result_offset + segment_count * 2 * 4 + segment_count * 4, segment_count);
+        result_offset + segment_count * 3 * 4, segment_count);
+    const wasm_batches = new Int32Array(state.wasm.memory.buffer,
+        result_offset + round_to_pow2(segment_count * (3 * 4 + 1), 4), batch_count * 2);

    context.instance_data_points.data = wasm_points;
    context.instance_data_points.size = segment_count * 2;
+    context.instance_data_points.capacity = segment_count * 2;

    context.instance_data_ids.data = wasm_ids;
    context.instance_data_ids.size = segment_count;
+    context.instance_data_ids.capacity = segment_count;

    context.instance_data_pressures.data = wasm_pressures;
    context.instance_data_pressures.size = segment_count;
+    context.instance_data_pressures.capacity = segment_count;
+
+    context.instance_data_batches.data = wasm_batches;
+    context.instance_data_batches.size = batch_count * 2;
+    context.instance_data_batches.capacity = batch_count * 2;

    return segment_count;
 }
--- a/client/wasm/lod.c
+++ b/client/wasm/lod.c
@@ -197,11 +197,13 @@ rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coo
 void
 do_lod(int *clipped_indices, int clipped_count, float zoom, 
       int *stroke_coords_from,
+       int *width,
       float *xs,
       float *ys,
       unsigned char *pressures,
       char **result_buffer,
-       int *result_count)
+       int *result_count,
+       int *result_batch_count)
 {
    if (clipped_count == 0) {
        result_count[0] = 0;
@@ -280,18 +282,22 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,

    // Write actual coordinates (points) and stroke ids
    // Do this in one allocation so that they're not interleaved between threads
-    char *output = alloc_dynamic(segments_head * (3 * 4 + 1));
+    char *output = alloc_dynamic(round_to_pow2(segments_head * (3 * 4 + 1), 4) + clipped_count * 4 * 2); // max two ints per stroke for batch info (realistically, much less)
    float *points = (float *) output;
    int *ids = (int *) (output + segments_head * 4 * 2);
    unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3);
+    int *batches = (int *) (output + round_to_pow2(segments_head * (4 * 3 + 1), 4));

    int phead = 0;
    int ihead = 0;
+    float sqrt_zoom = __builtin_sqrtf(zoom);
+    int last_lod = -100;
+    int batch_count = 0;
+    int batch_size = 0;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];
        
-        // TODO: convert to a proper CSR, save half the memory
        int base_stroke = stroke_coords_from[stroke_index];
        int from = segments_from[i];
        int to = segments_from[i + 1];
@@ -312,35 +318,77 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
                ids[ihead++] = stroke_index | (1 << 31);
            }
        }
+
+        int segment_count = to - from;
+
+        // Compute recommended LOD level, add to current batch or start new batch
+        float sqrt_width = __builtin_sqrtf(width[stroke_index]); 
+        int lod = __builtin_ceil(sqrt_zoom * sqrt_width * 0.3333f); // TODO: round
+
+        if (lod > 7) lod = 7;
+
+        if (batch_size > 0 && __builtin_abs(lod - last_lod) > 2) {
+            // Start new batch
+            batches[batch_count * 2 + 0] = batch_size;
+            batches[batch_count * 2 + 1] = last_lod;
+            ++batch_count;
+            batch_size = 0;
+        }
+
+        batch_size += segment_count;
+        last_lod = lod;
+    }
+
+    if (batch_size > 0) {
+        batches[batch_count * 2 + 0] = batch_size;
+        batches[batch_count * 2 + 1] = last_lod;
+        ++batch_count;
    }

    result_buffer[0] = output;
    result_count[0] = segments_head;
+    result_batch_count[0] = batch_count;
 }

 // NOT thread-safe, only call from one thread
 char *
-merge_results(int *segment_counts, char **buffers, int nthreads)
+merge_results(int *segment_counts, int *batch_counts, char **buffers, int nthreads)
 {
    int total_segments = 0;
+    int total_batches = 0;

    for (int i = 0; i < nthreads; ++i) {
        total_segments += segment_counts[i];
+        total_batches += batch_counts[i];
    }

-    char *merged = alloc_dynamic(total_segments * (3 * 4 + 1));
+    char *merged = alloc_dynamic(round_to_pow2(total_segments * (3 * 4 + 1), 4) + total_batches * 4);
    
    float *points = (float *) merged;
    int *ids = (int *) (merged + total_segments * 4 * 2);
    unsigned char *pressures = (unsigned char *) (merged + total_segments * 4 * 3);
+    int *batches = (int *) (merged + round_to_pow2(total_segments * (3 * 4 + 1), 4));
+    int batch_base = 0;
+    int last_batch_lod = -99;
+    int bhead = 0;
+    int written_batches = 0;

    for (int i = 0; i < nthreads; ++i) {
        int segments = segment_counts[i];
+        int nbatches = batch_counts[i];
+        int *thread_batches = (int *) (buffers[i] + round_to_pow2(segments * (4 * 3 + 1), 4));  
+        
        if (segments > 0) {
            __builtin_memcpy(points, buffers[i], segments * 4 * 2);
            __builtin_memcpy(ids, buffers[i] + segments * 4 * 2, segments * 4);
            __builtin_memcpy(pressures, buffers[i] + segments * 4 * 3, segments);

+            for (int j = 0; j < nbatches * 2; j += 2) {
+                batches[bhead++] = written_batches;
+                batches[bhead++] = thread_batches[j + 1];
+                written_batches += thread_batches[j + 0];
+            }
+
            points += segments * 2;
            ids += segments;
            pressures += segments;
@@ -348,6 +396,7 @@ merge_results(int *segment_counts, char **buffers, int nthreads)
    }

    segment_counts[0] = total_segments;
+    batch_counts[0] = total_batches;

    return(merged);
 }
--- a/client/wasm/lod.wasm
+++ b/client/wasm/lod.wasm
--- a/client/webgl_draw.js
+++ b/client/webgl_draw.js
@@ -273,13 +273,11 @@ async function draw(state, context, animate, ts) {
        }
    }

-    // TODO: what do we do with this
-    const circle_lod = Math.round(Math.min(7, 3 * Math.sqrt(state.canvas.zoom)));
+    // TODO: @speed we can do this once at startup
    const lod_levels = [];
    let total_lod_floats = 0;
    let total_lod_indices = 0;
    let stat_total_vertices = 0;
-
    for (let i = 0; i <= 7; ++i) {
        const d = geometry_good_circle_and_dummy(i);
        lod_levels.push({
@@ -295,20 +293,9 @@ async function draw(state, context, animate, ts) {
    if (segment_count > 0) {
        const pr = programs['main'];

-        const nbatches = 10;
-        const batches = [];
-
-        for (let i = 0; i < nbatches; ++i) {
-            batches.push({
-                'index': Math.floor(segment_count / nbatches * i),
-                'lod': circle_lod,
-            });
-
-            if (i % 2 == 1) {
-                batches[batches.length - 1].lod = Math.max(0, batches[batches.length - 1].lod - 4);
-            }
-        }
-        batches.push({'index': segment_count, 'lod': -1}); // lod unused
+        // Last pair (lod unused) to have a proper from;to
+        tv_add2(context.instance_data_batches, segment_count);
+        tv_add2(context.instance_data_batches, -1);

        gl.clear(gl.DEPTH_BUFFER_BIT); // draw strokes above the images 
        gl.useProgram(pr.program);
@@ -363,11 +350,10 @@ async function draw(state, context, animate, ts) {
        gl.vertexAttribDivisor(pr.locations['a_stroke_id'], 1);
        gl.vertexAttribDivisor(pr.locations['a_pressure'], 1);

-        for (let b = 0; b < batches.length - 1; ++b) {
-            const batch = batches[b];
-            const batch_from = batches[b].index;
-            const batch_size = batches[b + 1].index - batch_from;
-            const level = lod_levels[batch.lod];
+        for (let b = 0; b < context.instance_data_batches.size - 2; b += 2) {
+            const batch_from = context.instance_data_batches.data[b + 0];
+            const batch_size = context.instance_data_batches.data[b + 2] - batch_from;
+            const level = lod_levels[context.instance_data_batches.data[b + 1]];

            if (batch_size > 0) {
                stat_total_vertices += batch_size * level.data.indices.size;
@@ -593,7 +579,6 @@ async function draw(state, context, animate, ts) {
    <span>Strokes onscreen: ${context.clipped_indices.size}</span>
    <span>Segments onscreen: ${segment_count}</span>
    <span>Total vertices: ${stat_total_vertices}</span>
-    <span>Circle LOD: ${circle_lod}</span>
    <span>Canvas offset: (${Math.round(state.canvas.offset.x * 100) / 100}, ${Math.round(state.canvas.offset.y * 100) / 100})</span>
    <span>Canvas zoom level: ${state.canvas.zoom_level}</span>
    <span>Canvas zoom: ${Math.round(state.canvas.zoom * 100) / 100}</span>`;
--- a/client/webgl_geometry.js
+++ b/client/webgl_geometry.js
@@ -58,6 +58,9 @@ function geometry_add_stroke(state, context, stroke, stroke_index, skip_bvh = fa
    ser_u16(context.stroke_data, b);
    ser_u16(context.stroke_data, stroke.width);

+    tv_add(state.wasm.buffers['width'].tv, stroke.width);
+    state.wasm.buffers['width'].used += 4;
+
    if (!skip_bvh) bvh_add_stroke(state, state.bvh, stroke_index, stroke);
 }
Author	SHA1	Message	Date
A.Olokhtonov	4a6715ef66	Batching is close to working. Not quite though	11 months ago
A.Olokhtonov	ce824a8e31	Pass stroke widths to wasm (not actually writing any values right now, just some busywork to allocate memory and pass pointers)	12 months ago