Batching is close to working, though not quite there yet.

Also possibly fixed a very nasty bug, though probably not.
1 year ago · 777772530f
7 changed files with 74 additions and 39 deletions
--- a/client/index.js
+++ b/client/index.js
@@ -227,6 +227,7 @@ async function main() {
        'instance_data_points': tv_create(Float32Array, 4096),
        'instance_data_ids': tv_create(Uint32Array, 4096),
        'instance_data_pressures': tv_create(Uint8Array, 4096),
+        'instance_data_batches': tv_create(Uint32Array, 4096),
       
        'dynamic_instance_points': tv_create(Float32Array, 4096),
        'dynamic_instance_pressure': tv_create(Uint8Array, 4096),
--- a/client/lod_worker.js
+++ b/client/lod_worker.js
@@ -25,6 +25,7 @@ function work(indices_base, indices_count, zoom, offsets) {
            offsets['pressures'],
            offsets['result_buffers'] + thread_id * 4,
            offsets['result_counts'] + thread_id * 4,
+            offsets['result_batch_counts'] + thread_id * 4,
        );
    } catch (e) {
        console.error('WASM:', e);
--- a/client/speed.js
+++ b/client/speed.js
@@ -43,7 +43,7 @@ async function init_wasm(state) {
        env: { 'memory': memory }
    });

-    const nworkers = navigator.hardwareConcurrency;
+    const nworkers = 1; //navigator.hardwareConcurrency;

    state.wasm.exports = master_wasm.instance.exports;
    state.wasm.heap_base = state.wasm.exports.alloc_static(0);
@@ -185,6 +185,7 @@ async function do_lod(state, context) {
    const buffers = state.wasm.buffers;
    const result_buffers = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
    const result_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
+    const result_batch_counts = state.wasm.exports.alloc_dynamic(state.wasm.workers.length * 4);
    const clipped_indices = state.wasm.exports.alloc_dynamic(context.clipped_indices.size * 4); 
    const mem = new Uint8Array(state.wasm.memory.buffer);
    
@@ -202,6 +203,7 @@ async function do_lod(state, context) {
        'pressures': buffers['pressures'].offset,
        'result_buffers': result_buffers,
        'result_counts': result_counts,
+        'result_batch_counts': result_batch_counts,
    };

    const jobs = [];
@@ -226,11 +228,13 @@ async function do_lod(state, context) {

    const result_offset = state.wasm.exports.merge_results(
        result_counts,
+        result_batch_counts,
        result_buffers,
        state.wasm.workers.length
    );

    const segment_count = new Int32Array(state.wasm.memory.buffer, result_counts, 1)[0]; // by convention
+    const batch_count = new Int32Array(state.wasm.memory.buffer, result_batch_counts, 1)[0]; // by convention

    // Use results without copying from WASM memory 
    const wasm_points = new Float32Array(state.wasm.memory.buffer, 
@@ -238,16 +242,25 @@ async function do_lod(state, context) {
    const wasm_ids = new Uint32Array(state.wasm.memory.buffer, 
        result_offset + segment_count * 2 * 4, segment_count);
    const wasm_pressures = new Uint8Array(state.wasm.memory.buffer,
-        result_offset + segment_count * 2 * 4 + segment_count * 4, segment_count);
+        result_offset + segment_count * 3 * 4, segment_count);
+    const wasm_batches = new Int32Array(state.wasm.memory.buffer,
+        result_offset + round_to_pow2(segment_count * (3 * 4 + 1), 4), batch_count * 2);

    context.instance_data_points.data = wasm_points;
    context.instance_data_points.size = segment_count * 2;
+    context.instance_data_points.capacity = segment_count * 2;

    context.instance_data_ids.data = wasm_ids;
    context.instance_data_ids.size = segment_count;
+    context.instance_data_ids.capacity = segment_count;

    context.instance_data_pressures.data = wasm_pressures;
    context.instance_data_pressures.size = segment_count;
+    context.instance_data_pressures.capacity = segment_count;
+
+    context.instance_data_batches.data = wasm_batches;
+    context.instance_data_batches.size = batch_count * 2;
+    context.instance_data_batches.capacity = batch_count * 2;

    return segment_count;
 }
--- a/client/wasm/lod.c
+++ b/client/wasm/lod.c
@@ -202,7 +202,8 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
       float *ys,
       unsigned char *pressures,
       char **result_buffer,
-       int *result_count)
+       int *result_count,
+       int *result_batch_count)
 {
    if (clipped_count == 0) {
        result_count[0] = 0;
@@ -281,16 +282,18 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,

    // Write actual coordinates (points) and stroke ids
    // Do this in one allocation so that they're not interleaved between threads
-    char *output = alloc_dynamic(segments_head * (3 * 4 + 1) + clipped_count * 4);
+    char *output = alloc_dynamic(round_to_pow2(segments_head * (3 * 4 + 1), 4) + clipped_count * 4 * 2); // max two ints per stroke for batch info (realistically, much less)
    float *points = (float *) output;
    int *ids = (int *) (output + segments_head * 4 * 2);
    unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3);
-    unsigned int *batches = (unsigned int *) (output + segments_head * (4 * 3 + 1));
+    int *batches = (int *) (output + round_to_pow2(segments_head * (4 * 3 + 1), 4));

    int phead = 0;
    int ihead = 0;
    float sqrt_zoom = __builtin_sqrtf(zoom);
-    int last_lod = -1;
+    int last_lod = -100;
+    int batch_count = 0;
+    int batch_size = 0;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];
@@ -316,48 +319,76 @@ do_lod(int *clipped_indices, int clipped_count, float zoom,
            }
        }

+        int segment_count = to - from;
+
        // Compute recommended LOD level, add to current batch or start new batch
-        float sqrt_width = __builtin_sqrtf(width[stroke_index]); // TOOD: pass in stroke width
-        int lod = __builtin_round(sqrt_zoom * sqrt_width * 0.3333f);
+        float sqrt_width = __builtin_sqrtf(width[stroke_index]); 
+        int lod = __builtin_ceil(sqrt_zoom * sqrt_width * 0.3333f); // TODO: round

-#if 0
-        if (__builtin_abs(lod - last_lod) > 2) {
+        if (lod > 7) lod = 7;
+
+        if (batch_size > 0 && __builtin_abs(lod - last_lod) > 2) {
            // Start new batch
-        } else {
-            // Add to existing batch
+            batches[batch_count * 2 + 0] = batch_size;
+            batches[batch_count * 2 + 1] = last_lod;
+            ++batch_count;
+            batch_size = 0;
        }

+        batch_size += segment_count;
        last_lod = lod;
-#endif
    }
-    
+
+    if (batch_size > 0) {
+        batches[batch_count * 2 + 0] = batch_size;
+        batches[batch_count * 2 + 1] = last_lod;
+        ++batch_count;
+    }
+
    result_buffer[0] = output;
    result_count[0] = segments_head;
+    result_batch_count[0] = batch_count;
 }

 // NOT thread-safe, only call from one thread
 char *
-merge_results(int *segment_counts, char **buffers, int nthreads)
+merge_results(int *segment_counts, int *batch_counts, char **buffers, int nthreads)
 {
    int total_segments = 0;
+    int total_batches = 0;

    for (int i = 0; i < nthreads; ++i) {
        total_segments += segment_counts[i];
+        total_batches += batch_counts[i];
    }

-    char *merged = alloc_dynamic(total_segments * (3 * 4 + 1));
+    char *merged = alloc_dynamic(round_to_pow2(total_segments * (3 * 4 + 1), 4) + total_batches * 4);
    
    float *points = (float *) merged;
    int *ids = (int *) (merged + total_segments * 4 * 2);
    unsigned char *pressures = (unsigned char *) (merged + total_segments * 4 * 3);
+    int *batches = (int *) (merged + round_to_pow2(total_segments * (3 * 4 + 1), 4));
+    int batch_base = 0;
+    int last_batch_lod = -99;
+    int bhead = 0;
+    int written_batches = 0;

    for (int i = 0; i < nthreads; ++i) {
        int segments = segment_counts[i];
+        int nbatches = batch_counts[i];
+        int *thread_batches = (int *) (buffers[i] + round_to_pow2(segments * (4 * 3 + 1), 4));  
+        
        if (segments > 0) {
            __builtin_memcpy(points, buffers[i], segments * 4 * 2);
            __builtin_memcpy(ids, buffers[i] + segments * 4 * 2, segments * 4);
            __builtin_memcpy(pressures, buffers[i] + segments * 4 * 3, segments);

+            for (int j = 0; j < nbatches * 2; j += 2) {
+                batches[bhead++] = written_batches;
+                batches[bhead++] = thread_batches[j + 1];
+                written_batches += thread_batches[j + 0];
+            }
+
            points += segments * 2;
            ids += segments;
            pressures += segments;
@@ -365,6 +396,7 @@ merge_results(int *segment_counts, char **buffers, int nthreads)
    }

    segment_counts[0] = total_segments;
+    batch_counts[0] = total_batches;

    return(merged);
 }
--- a/client/wasm/lod.wasm
+++ b/client/wasm/lod.wasm
--- a/client/webgl_draw.js
+++ b/client/webgl_draw.js
@@ -273,13 +273,11 @@ async function draw(state, context, animate, ts) {
        }
    }

-    // TODO: what do we do with this
-    const circle_lod = Math.round(Math.min(7, 3 * Math.sqrt(state.canvas.zoom)));
+    // TODO: @speed we can do this once at startup
    const lod_levels = [];
    let total_lod_floats = 0;
    let total_lod_indices = 0;
    let stat_total_vertices = 0;
-
    for (let i = 0; i <= 7; ++i) {
        const d = geometry_good_circle_and_dummy(i);
        lod_levels.push({
@@ -295,20 +293,9 @@ async function draw(state, context, animate, ts) {
    if (segment_count > 0) {
        const pr = programs['main'];

-        const nbatches = 10;
-        const batches = [];
-
-        for (let i = 0; i < nbatches; ++i) {
-            batches.push({
-                'index': Math.floor(segment_count / nbatches * i),
-                'lod': circle_lod,
-            });
-
-            if (i % 2 == 1) {
-                batches[batches.length - 1].lod = Math.max(0, batches[batches.length - 1].lod);
-            }
-        }
-        batches.push({'index': segment_count, 'lod': -1}); // lod unused
+        // Last pair (lod unused) to have a proper from;to
+        tv_add2(context.instance_data_batches, segment_count);
+        tv_add2(context.instance_data_batches, -1);

        gl.clear(gl.DEPTH_BUFFER_BIT); // draw strokes above the images 
        gl.useProgram(pr.program);
@@ -363,11 +350,10 @@ async function draw(state, context, animate, ts) {
        gl.vertexAttribDivisor(pr.locations['a_stroke_id'], 1);
        gl.vertexAttribDivisor(pr.locations['a_pressure'], 1);

-        for (let b = 0; b < batches.length - 1; ++b) {
-            const batch = batches[b];
-            const batch_from = batches[b].index;
-            const batch_size = batches[b + 1].index - batch_from;
-            const level = lod_levels[batch.lod];
+        for (let b = 0; b < context.instance_data_batches.size - 2; b += 2) {
+            const batch_from = context.instance_data_batches.data[b + 0];
+            const batch_size = context.instance_data_batches.data[b + 2] - batch_from;
+            const level = lod_levels[context.instance_data_batches.data[b + 1]];

            if (batch_size > 0) {
                stat_total_vertices += batch_size * level.data.indices.size;
@@ -593,7 +579,6 @@ async function draw(state, context, animate, ts) {
    <span>Strokes onscreen: ${context.clipped_indices.size}</span>
    <span>Segments onscreen: ${segment_count}</span>
    <span>Total vertices: ${stat_total_vertices}</span>
-    <span>Circle LOD: ${circle_lod}</span>
    <span>Canvas offset: (${Math.round(state.canvas.offset.x * 100) / 100}, ${Math.round(state.canvas.offset.y * 100) / 100})</span>
    <span>Canvas zoom level: ${state.canvas.zoom_level}</span>
    <span>Canvas zoom: ${Math.round(state.canvas.zoom * 100) / 100}</span>`;
--- a/client/webgl_geometry.js
+++ b/client/webgl_geometry.js
@@ -58,6 +58,9 @@ function geometry_add_stroke(state, context, stroke, stroke_index, skip_bvh = fa
    ser_u16(context.stroke_data, b);
    ser_u16(context.stroke_data, stroke.width);

+    tv_add(state.wasm.buffers['width'].tv, stroke.width);
+    state.wasm.buffers['width'].used += 4;
+
    if (!skip_bvh) bvh_add_stroke(state, state.bvh, stroke_index, stroke);
 }