// desk2/client/wasm/lod.c

// clang -Oz --target=wasm32 -nostdlib -Wl,--export-all,--no-entry -msimd128 lod.c -o lod.wasm

#include <wasm_simd128.h>

// Base address of the heap region: alloc_static() and alloc_dynamic() hand out
// addresses relative to this symbol. It is declared extern and not defined in
// this file (presumably supplied by the linker -- TODO confirm).
extern char __heap_base;

// Number of bytes handed out so far by alloc_static(); reset by free_static().
static int allocated_static;
// Number of bytes handed out so far by alloc_dynamic(); reset by free_dynamic().
// The dynamic region starts right after the static one.
static int allocated_dynamic;

// Sets the WebAssembly `__stack_pointer` global to `sp`.
// NOTE(review): presumably called once per worker thread so that each thread
// runs on its own stack region -- confirm against the JS side.
void
set_sp(char *sp)
{
    __asm__ __volatile__(
        // Declare the global so the assembler knows its type
        ".globaltype __stack_pointer, i32\n"
        // Push the `sp` argument and store it into the global
        "local.get %0\n"
        "global.set __stack_pointer\n"
        : : "r"(sp)
    );
}

// Releases everything handed out by alloc_static() by rewinding its bump
// offset to zero. Because the dynamic region is placed right after the static
// one, this also moves the base used by subsequent alloc_dynamic() calls.
void
free_static(void)
{
    allocated_static = 0;
}

// Releases everything handed out by alloc_dynamic() by rewinding its bump
// offset to zero. This is a plain (non-atomic) store, unlike the atomic add in
// alloc_dynamic().
// NOTE(review): presumably only called while no thread is allocating -- confirm.
void
free_dynamic(void)
{
    allocated_dynamic = 0;
}

// Bump-allocates `size` bytes from the static region that starts at
// __heap_base and returns the address of the new block.
// The size is used as-is (no rounding), so the alignment of the next block
// depends on the sizes requested so far.
// This IS NOT thread-safe: the offset is read and updated without atomics.
void *
alloc_static(int size)
{
    int offset = allocated_static;
    allocated_static = offset + size;
    return(&__heap_base + offset);
}

// Rounds `value` up to the nearest multiple of `multiple`.
// `multiple` must be a power of two: the rounding is done by clearing the low
// bits with a mask.
static int
round_to_pow2(int value, int multiple)
{
    int low_bits = multiple - 1;
    return((value + low_bits) & ~low_bits);
}

// Bump-allocates `size` bytes (rounded up to a multiple of 4) from the dynamic
// region, which starts right after the bytes handed out by alloc_static().
// CAN be called from multiple threads: the offset is claimed with a single
// atomic fetch-add, so concurrent callers get disjoint blocks.
void *
alloc_dynamic(int size)
{
    int rounded_size = round_to_pow2(size, 4);
    // fetch_add returns the offset *before* the add, i.e. the start of our block
    int block_offset = __atomic_fetch_add(&allocated_dynamic, rounded_size, __ATOMIC_SEQ_CST);
    return(&__heap_base + allocated_static + block_offset);
}

static int
rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coords_from, 
             int segment_start, int segment_end)
{
    int result = -1;

    if (segment_start == segment_end) {
        return(result);
    }

    float EPS = 0.125f / zoom * 255.0f;
    float max_dist = 0.0f;

    float ax = xs[coords_from + segment_start];
    float ay = ys[coords_from + segment_start];
    float bx = xs[coords_from + segment_end];
    float by = ys[coords_from + segment_end];

    unsigned char ap = pressures[coords_from / 2 + segment_start];
    unsigned char bp = pressures[coords_from / 2 + segment_end];

    float dx = bx - ax;
    float dy = by - ay;
    
    float dist_ab = __builtin_sqrtf(dx * dx + dy * dy);
    float dir_nx = dy / dist_ab * 255.0f;
    float dir_ny = -dx / dist_ab * 255.0f;

#if 0
        for (int i = segment_start + 1; i < segment_end; ++i) {
            float px = xs[coords_from + i];
            float py = ys[coords_from + i];

            unsigned char pp = pressures[coords_from + i];

            float apx = px - ax;
            float apy = py - ay;

            float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)
                + __builtin_abs(pp - ap) + __builtin_abs(pp - bp);

            if (dist > EPS && dist > max_dist) {
                result = i;
                max_dist = dist;
            }
        }
#else
    v128_t ax_x4 = wasm_f32x4_splat(ax);
    v128_t ay_x4 = wasm_f32x4_splat(ay);
    v128_t ap_x4 = wasm_f32x4_splat(ap);
    v128_t bp_x4 = wasm_f32x4_splat(bp);
    v128_t dir_nx_x4 = wasm_f32x4_splat(dir_nx);
    v128_t dir_ny_x4 = wasm_f32x4_splat(dir_ny);

    v128_t index_x4 = wasm_u32x4_make(segment_start + 1, segment_start + 2, segment_start + 3, segment_start + 4);
    v128_t four_x4 = wasm_u32x4_const_splat(4);
    v128_t max_dist_x4 = wasm_f32x4_splat(EPS);
    v128_t max_index_x4 = wasm_u32x4_const_splat(-1);

    for (int i = segment_start + 1; i < segment_end - 3; i += 4) {
        v128_t px_x4 = wasm_v128_load(xs + coords_from + i);
        v128_t py_x4 = wasm_v128_load(ys + coords_from + i);

        v128_t pp_x4 = wasm_f32x4_make(
            pressures[coords_from / 2 + i + 0],
            pressures[coords_from / 2 + i + 1],
            pressures[coords_from / 2 + i + 2],
            pressures[coords_from / 2 + i + 3]
        );

        v128_t apx_x4 = wasm_f32x4_sub(px_x4, ax_x4);
        v128_t apy_x4 = wasm_f32x4_sub(py_x4, ay_x4);

        v128_t dist_x4 = wasm_f32x4_add(
            wasm_f32x4_add(
                wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, ap_x4)),
                wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, bp_x4))
            ),
            wasm_f32x4_abs(
                wasm_f32x4_add(
                    wasm_f32x4_mul(apx_x4, dir_nx_x4),
                    wasm_f32x4_mul(apy_x4, dir_ny_x4)
                )
            )
        );
        
        v128_t mask = wasm_f32x4_gt(dist_x4, max_dist_x4);

        max_index_x4 = wasm_v128_bitselect(index_x4, max_index_x4, mask);
        max_dist_x4 = wasm_v128_bitselect(dist_x4, max_dist_x4, mask);
        
        index_x4 = wasm_i32x4_add(index_x4, four_x4);
    }

    int indices[4];
    float values[4];

    wasm_v128_store(indices, max_index_x4);
    wasm_v128_store(values, max_dist_x4);

    for (int i = 0; i < 4; ++i) {
        if (indices[i] != -1) {
            if (values[i] > max_dist) {
                result = indices[i];
                max_dist = values[i];
            }
        }
    }

    if (max_dist == EPS) {
        max_dist = 0.0f;
        result = -1;
    }

    int remainder = (segment_end - segment_start - 1) % 4;

    for (int i = segment_end - remainder; i < segment_end; ++i) {
        float px = xs[coords_from + i];
        float py = ys[coords_from + i];

        unsigned char pp = pressures[coords_from + i];

        float apx = px - ax;
        float apy = py - ay;

        float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)
            + __builtin_abs(pp - ap) + __builtin_abs(pp - bp);

        if (dist > EPS && dist > max_dist) {
            result = i;
            max_dist = dist;
        }
    }
#endif

    return(result);
}

// Simplifies a set of strokes for the given zoom level and writes the kept
// points into one freshly allocated output buffer.
//
//   clipped_indices     indices of the strokes to process
//   clipped_count       number of entries in clipped_indices
//   zoom                forwarded to rdp_find_max (controls the threshold)
//   stroke_coords_from  per-stroke offsets into xs/ys/pressures; stroke s owns
//                       points [stroke_coords_from[s], stroke_coords_from[s + 1])
//   xs, ys, pressures   per-point input arrays
//   result_buffer       result_buffer[0] receives the output buffer (left
//                       untouched when clipped_count is 0)
//   result_count        result_count[0] receives the number of output points
//
// Output layout for N points, in a single alloc_dynamic() block:
//   [N * 2 floats: x,y pairs][N ints: stroke ids][N bytes: pressures]
// The id of the last point of every stroke has its top bit set.
//
// A stroke with a single point is emitted as two identical points; a stroke
// with no points is emitted as nothing.
void
do_lod(int *clipped_indices, int clipped_count, float zoom, 
       int *stroke_coords_from,
       float *xs,
       float *ys,
       unsigned char *pressures,
       char **result_buffer,
       int *result_count)
{
    // TODO: what's a reasonable max size for the stack?
    enum { STACK_CAPACITY = 4096, STACK_ENTRIES_PER_SPLIT = 9 };

    if (clipped_count == 0) {
        result_count[0] = 0;
        return;
    }

    // Upper bound on the number of kept point indices. A non-empty stroke
    // always emits both of its endpoints, so even a 1-point stroke needs room
    // for two entries.
    int segments_capacity = 0;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];
        int point_count = stroke_coords_from[stroke_index + 1] - stroke_coords_from[stroke_index];

        if (point_count > 0) {
            segments_capacity += (point_count < 2) ? 2 : point_count;
        }
    }

    int *segments_from = alloc_dynamic((clipped_count + 1) * 4);
    int *segments = alloc_dynamic(segments_capacity * 4); // TODO: this is a very conservative estimate, we can lower memory usage if we get this tighter
    
    int segments_head = 0;

    // Explicit work stack of (type, start, end) triples.
    //   type 0: segment [start, end] that still has to be examined
    //   type 1: marker, emit `start` once everything before it has been emitted
    int stack[STACK_CAPACITY];
    
    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];
        
        // TODO: convert to a proper CSR, save half the memory
        int coords_from = stroke_coords_from[stroke_index];
        int coords_to = stroke_coords_from[stroke_index + 1];
        
        int point_count = coords_to - coords_from;
        
        // CSR-style offsets: stroke i owns segments[segments_from[i] .. segments_from[i + 1])
        segments_from[i] = segments_head;

        if (point_count <= 0) {
            // Nothing to emit; emitting endpoints here would index point -1
            continue;
        }
        
        int stack_head = 0;
        
        // First point is always kept
        segments[segments_head++] = 0;
        
        stack[stack_head++] = 0;
        stack[stack_head++] = 0;
        stack[stack_head++] = point_count - 1;
        
        while (stack_head > 0) {
            int end = stack[--stack_head];
            int start = stack[--stack_head];
            int type = stack[--stack_head];
            
            if (type == 1) {
                segments[segments_head++] = start;
            } else {
                int max = rdp_find_max(xs, ys, pressures, zoom, coords_from, start, end);

                // If the stack cannot take another split, stop refining this
                // segment instead of writing past the end of the array. The
                // result is coarser but still a valid polyline.
                if (max != -1 && stack_head + STACK_ENTRIES_PER_SPLIT <= STACK_CAPACITY) {
                    // Pushed in reverse so they pop as: left half, split
                    // point, right half -- keeping the output in point order.
                    stack[stack_head++] = 0;
                    stack[stack_head++] = max;
                    stack[stack_head++] = end;
                    
                    stack[stack_head++] = 1;
                    stack[stack_head++] = max;
                    stack[stack_head++] = -1;
                    
                    stack[stack_head++] = 0;
                    stack[stack_head++] = start;
                    stack[stack_head++] = max;
                }
            }
        }
        
        // Last point is always kept
        segments[segments_head++] = point_count - 1;
    }

    segments_from[clipped_count] = segments_head;

    // Write actual coordinates (points) and stroke ids
    // Do this in one allocation so that they're not interleaved between threads
    char *output = alloc_dynamic(segments_head * (3 * 4 + 1));
    float *points = (float *) output;
    unsigned int *ids = (unsigned int *) (output + segments_head * 4 * 2);
    unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3);

    int phead = 0;
    int ihead = 0;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];
        
        // TODO: convert to a proper CSR, save half the memory
        int base_stroke = stroke_coords_from[stroke_index];
        int from = segments_from[i];
        int to = segments_from[i + 1];

        for (int j = from; j < to; ++j) {
            int point_index = segments[j];
            float x = xs[base_stroke + point_index];
            float y = ys[base_stroke + point_index];

            points[phead++] = x;
            points[phead++] = y;

            pressures_res[ihead] = pressures[base_stroke + point_index];

            if (j != to - 1) {
                ids[ihead++] = (unsigned int) stroke_index;
            } else {
                // Top bit marks the last point of a stroke. Done in unsigned
                // arithmetic: `1 << 31` on a signed int is undefined.
                ids[ihead++] = (unsigned int) stroke_index | 0x80000000u;
            }
        }
    }
    
    result_buffer[0] = output;
    result_count[0] = segments_head;
}

// Concatenates the per-thread output buffers into a single buffer with the
// same three-section layout:
//   [N * 2 floats: x,y pairs][N ints: stroke ids][N bytes: pressures]
//
//   segment_counts  number of points in each thread's buffer; on return
//                   segment_counts[0] is overwritten with the merged total
//   buffers         one buffer per thread; only read when its count is > 0
//   nthreads        number of entries in segment_counts / buffers
//
// Returns the merged buffer (allocated with alloc_dynamic).
// NOT thread-safe, only call from one thread
char *
merge_results(int *segment_counts, char **buffers, int nthreads)
{
    int total = 0;

    for (int t = 0; t < nthreads; ++t) {
        total += segment_counts[t];
    }

    char *merged = alloc_dynamic(total * (3 * 4 + 1));

    // Byte cursors into the three sections of the merged buffer
    char *points_dst = merged;
    char *ids_dst = merged + total * 4 * 2;
    char *pressures_dst = merged + total * 4 * 3;

    for (int t = 0; t < nthreads; ++t) {
        int count = segment_counts[t];

        if (count <= 0) {
            continue;
        }

        const char *src = buffers[t];
        int points_bytes = count * 4 * 2;
        int ids_bytes = count * 4;

        // The source sections sit at offsets derived from this thread's own count
        __builtin_memcpy(points_dst, src, points_bytes);
        __builtin_memcpy(ids_dst, src + points_bytes, ids_bytes);
        __builtin_memcpy(pressures_dst, src + count * 4 * 3, count);

        points_dst += points_bytes;
        ids_dst += ids_bytes;
        pressures_dst += count;
    }

    segment_counts[0] = total;

    return(merged);
}
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago			`// clang -Oz --target=wasm32 -nostdlib -Wl,--export-all,--no-entry -msimd128 lod.c -o lod.wasm`
AoS -> SoA for point coordinates 8 months ago
Fix wrong wasm allocation, remove unused js LOD code, take radius of stroke into account when doing LOD, reduce EPS for LOD 8 months ago			`#include <wasm_simd128.h>`

Successfull sum function in wasm :D 10 months ago			`extern char __heap_base;`
Store WASM-processed data in WASM memory 10 months ago
			`static int allocated_static;`
			`static int allocated_dynamic;`

A small test WASM module to demonstrate how to use shared memory and run multiple threads in WASM without Emscripten 8 months ago			`void`
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`set_sp(char *sp)`
A small test WASM module to demonstrate how to use shared memory and run multiple threads in WASM without Emscripten 8 months ago			`{`
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`__asm__ __volatile__(`
			`".globaltype __stack_pointer, i32\n"`
			`"local.get %0\n"`
A small test WASM module to demonstrate how to use shared memory and run multiple threads in WASM without Emscripten 8 months ago			`"global.set __stack_pointer\n"`
			`: : "r"(sp)`
			`);`
			`}`

Store WASM-processed data in WASM memory 10 months ago			`void`
			`free_static(void)`
			`{`
			`allocated_static = 0;`
			`}`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago
LOD generation has been wassembled! (a little borked for now though) 10 months ago			`void`
Store WASM-processed data in WASM memory 10 months ago			`free_dynamic(void)`
LOD generation has been wassembled! (a little borked for now though) 10 months ago			`{`
Store WASM-processed data in WASM memory 10 months ago			`allocated_dynamic = 0;`
LOD generation has been wassembled! (a little borked for now though) 10 months ago			`}`

Successfull sum function in wasm :D 10 months ago			`void *`
Store WASM-processed data in WASM memory 10 months ago			`alloc_static(int size)`
Successfull sum function in wasm :D 10 months ago			`{`
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`// This IS NOT thread-safe`
Store WASM-processed data in WASM memory 10 months ago			`void *result = &__heap_base + allocated_static;`
			`allocated_static += size;`
			`return(result);`
			`}`

Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`static int`
			`round_to_pow2(int value, int multiple)`
			`{`
			`return((value + multiple - 1) & -multiple);`
			`}`

Store WASM-processed data in WASM memory 10 months ago			`void *`
			`alloc_dynamic(int size)`
			`{`
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`// Very ad-van-ced thread-safe allocator`
			`// CAN be called from multiple threads`
			`size = round_to_pow2(size, 4);`
			`int original_allocated_dynamic = __atomic_fetch_add(&allocated_dynamic, size, __ATOMIC_SEQ_CST);`
			`void *result = &__heap_base + allocated_static + original_allocated_dynamic;`
Successfull sum function in wasm :D 10 months ago			`return(result);`
			`}`

Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`static int`
AoS -> SoA for point coordinates 8 months ago			`rdp_find_max(float xs, float ys, unsigned char *pressures, float zoom, int coords_from,`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`int segment_start, int segment_end)`
			`{`
			`int result = -1;`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago
			`if (segment_start == segment_end) {`
			`return(result);`
			`}`

			`float EPS = 0.125f / zoom * 255.0f;`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`float max_dist = 0.0f;`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago
AoS -> SoA for point coordinates 8 months ago			`float ax = xs[coords_from + segment_start];`
			`float ay = ys[coords_from + segment_start];`
			`float bx = xs[coords_from + segment_end];`
			`float by = ys[coords_from + segment_end];`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago
Fix strokes being added to storage even when skipped by SN logic 10 months ago			`unsigned char ap = pressures[coords_from / 2 + segment_start];`
			`unsigned char bp = pressures[coords_from / 2 + segment_end];`

Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`float dx = bx - ax;`
			`float dy = by - ay;`

Successfull sum function in wasm :D 10 months ago			`float dist_ab = __builtin_sqrtf(dx * dx + dy * dy);`
AoS -> SoA for point coordinates 8 months ago			`float dir_nx = dy / dist_ab * 255.0f;`
			`float dir_ny = -dx / dist_ab * 255.0f;`
Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago
Oops, actually enable SIMD 8 months ago			`#if 0`
Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago			`for (int i = segment_start + 1; i < segment_end; ++i) {`
			`float px = xs[coords_from + i];`
			`float py = ys[coords_from + i];`
AoS -> SoA for point coordinates 8 months ago
Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago			`unsigned char pp = pressures[coords_from + i];`

			`float apx = px - ax;`
			`float apy = py - ay;`

			`float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)`
			`+ __builtin_abs(pp - ap) + __builtin_abs(pp - bp);`

			`if (dist > EPS && dist > max_dist) {`
			`result = i;`
			`max_dist = dist;`
			`}`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago			`}`
			`#else`
			`v128_t ax_x4 = wasm_f32x4_splat(ax);`
			`v128_t ay_x4 = wasm_f32x4_splat(ay);`
			`v128_t ap_x4 = wasm_f32x4_splat(ap);`
			`v128_t bp_x4 = wasm_f32x4_splat(bp);`
			`v128_t dir_nx_x4 = wasm_f32x4_splat(dir_nx);`
			`v128_t dir_ny_x4 = wasm_f32x4_splat(dir_ny);`

			`v128_t index_x4 = wasm_u32x4_make(segment_start + 1, segment_start + 2, segment_start + 3, segment_start + 4);`
			`v128_t four_x4 = wasm_u32x4_const_splat(4);`
Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago			`v128_t max_dist_x4 = wasm_f32x4_splat(EPS);`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago			`v128_t max_index_x4 = wasm_u32x4_const_splat(-1);`

			`for (int i = segment_start + 1; i < segment_end - 3; i += 4) {`
			`v128_t px_x4 = wasm_v128_load(xs + coords_from + i);`
			`v128_t py_x4 = wasm_v128_load(ys + coords_from + i);`

Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago			`v128_t pp_x4 = wasm_f32x4_make(`
			`pressures[coords_from / 2 + i + 0],`
			`pressures[coords_from / 2 + i + 1],`
			`pressures[coords_from / 2 + i + 2],`
			`pressures[coords_from / 2 + i + 3]`
			`);`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago
			`v128_t apx_x4 = wasm_f32x4_sub(px_x4, ax_x4);`
			`v128_t apy_x4 = wasm_f32x4_sub(py_x4, ay_x4);`

			`v128_t dist_x4 = wasm_f32x4_add(`
AoS -> SoA for point coordinates 8 months ago			`wasm_f32x4_add(`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago			`wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, ap_x4)),`
			`wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, bp_x4))`
AoS -> SoA for point coordinates 8 months ago			`),`
			`wasm_f32x4_abs(`
			`wasm_f32x4_add(`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago			`wasm_f32x4_mul(apx_x4, dir_nx_x4),`
			`wasm_f32x4_mul(apy_x4, dir_ny_x4)`
AoS -> SoA for point coordinates 8 months ago			`)`
			`)`
			`);`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago
Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago			`v128_t mask = wasm_f32x4_gt(dist_x4, max_dist_x4);`
AoS -> SoA for point coordinates 8 months ago
Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago			`max_index_x4 = wasm_v128_bitselect(index_x4, max_index_x4, mask);`
			`max_dist_x4 = wasm_v128_bitselect(dist_x4, max_dist_x4, mask);`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago
			`index_x4 = wasm_i32x4_add(index_x4, four_x4);`
			`}`
AoS -> SoA for point coordinates 8 months ago
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago			`int indices[4];`
			`float values[4];`

			`wasm_v128_store(indices, max_index_x4);`
Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago			`wasm_v128_store(values, max_dist_x4);`
AoS -> SoA for point coordinates 8 months ago
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago			`for (int i = 0; i < 4; ++i) {`
			`if (indices[i] != -1) {`
			`if (values[i] > max_dist) {`
			`result = indices[i];`
			`max_dist = values[i];`
			`}`
			`}`
			`}`

Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago			`if (max_dist == EPS) {`
			`max_dist = 0.0f;`
			`result = -1;`
			`}`

SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago			`int remainder = (segment_end - segment_start - 1) % 4;`

			`for (int i = segment_end - remainder; i < segment_end; ++i) {`
AoS -> SoA for point coordinates 8 months ago			`float px = xs[coords_from + i];`
			`float py = ys[coords_from + i];`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago
Fix strokes being added to storage even when skipped by SN logic 10 months ago			`unsigned char pp = pressures[coords_from + i];`

Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`float apx = px - ax;`
			`float apy = py - ay;`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago
Fix wrong wasm allocation, remove unused js LOD code, take radius of stroke into account when doing LOD, reduce EPS for LOD 8 months ago			`float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)`
AoS -> SoA for point coordinates 8 months ago			`+ __builtin_abs(pp - ap) + __builtin_abs(pp - bp);`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`if (dist > EPS && dist > max_dist) {`
			`result = i;`
			`max_dist = dist;`
			`}`
			`}`
SIMD version of rdp_find_max. 16->12ms cpu frametime firefox, 16->9ms chrome 8 months ago			`#endif`
Change extracts of pressure byutes to a f32x4_make. Change andnot masking to a bitselect. Rename readme.md to readme.txt to hopefully fix gitea formatting 8 months ago
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`return(result);`
			`}`

Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`void`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`do_lod(int *clipped_indices, int clipped_count, float zoom,`
Store WASM-processed data in WASM memory 10 months ago			`int *stroke_coords_from,`
AoS -> SoA for point coordinates 8 months ago			`float *xs,`
			`float *ys,`
First working draft of pressure hanlding 10 months ago			`unsigned char *pressures,`
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`char **result_buffer,`
			`int *result_count)`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`{`
LOD generation has been wassembled! (a little borked for now though) 10 months ago			`if (clipped_count == 0) {`
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`result_count[0] = 0;`
			`return;`
LOD generation has been wassembled! (a little borked for now though) 10 months ago			`}`
Store WASM-processed data in WASM memory 10 months ago
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`int first_stroke = clipped_indices[0];`
			`int last_stroke = clipped_indices[clipped_count - 1];`
Correcty complute total point count. Only call glClear once workers have finished LOD. Only allow next draw() call after we finished rendering frame 8 months ago			`int total_points = 0;`

			`for (int i = 0; i < clipped_count; ++i) {`
			`int stroke_index = clipped_indices[i];`
			`total_points += stroke_coords_from[stroke_index + 1] - stroke_coords_from[stroke_index];`
			`}`
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago
Store WASM-processed data in WASM memory 10 months ago			`int segments_from = alloc_dynamic((clipped_count + 1) 4);`
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`int segments = alloc_dynamic(total_points 4); // TODO: this is a very conservative estimate, we can lower memory usage if we get this tighter`
LOD generation has been wassembled! (a little borked for now though) 10 months ago
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`int segments_head = 0;`
LOD generation has been wassembled! (a little borked for now though) 10 months ago			`int stack[4096]; // TODO: what's a reasonable max size for this?`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago
			`for (int i = 0; i < clipped_count; ++i) {`
			`int stroke_index = clipped_indices[i];`

			`// TODO: convert to a proper CSR, save half the memory`
			`int coords_from = stroke_coords_from[stroke_index];`
Store WASM-processed data in WASM memory 10 months ago			`int coords_to = stroke_coords_from[stroke_index + 1];`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago
AoS -> SoA for point coordinates 8 months ago			`int point_count = coords_to - coords_from;`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago
			`// Basic CSR crap`
			`segments_from[i] = segments_head;`

			`int segment_count = 2;`
			`int stack_head = 0;`

			`segments[segments_head++] = 0;`

			`stack[stack_head++] = 0;`
			`stack[stack_head++] = 0;`
			`stack[stack_head++] = point_count - 1;`

			`while (stack_head > 0) {`
			`int end = stack[--stack_head];`
			`int start = stack[--stack_head];`
			`int type = stack[--stack_head];`

			`if (type == 1) {`
			`segments[segments_head++] = start;`
			`} else {`
AoS -> SoA for point coordinates 8 months ago			`int max = rdp_find_max(xs, ys, pressures, zoom, coords_from, start, end);`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago			`if (max != -1) {`
			`segment_count += 1;`

			`stack[stack_head++] = 0;`
			`stack[stack_head++] = max;`
			`stack[stack_head++] = end;`

			`stack[stack_head++] = 1;`
			`stack[stack_head++] = max;`
			`stack[stack_head++] = -1;`

			`stack[stack_head++] = 0;`
			`stack[stack_head++] = start;`
			`stack[stack_head++] = max;`
			`}`
			`}`
			`}`

			`segments[segments_head++] = point_count - 1;`
			`}`
LOD generation has been wassembled! (a little borked for now though) 10 months ago
			`segments_from[clipped_count] = segments_head;`
Move point writes to WASM, already a lot faster!!!! 10 months ago
			`// Write actual coordinates (points) and stroke ids`
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`// Do this in one allocation so that they're not interleaved between threads`
			`char output = alloc_dynamic(segments_head (3 * 4 + 1));`
			`float points = (float ) output;`
			`int ids = (int ) (output + segments_head * 4 * 2);`
			`unsigned char pressures_res = (unsigned char ) (output + segments_head * 4 * 3);`
Move point writes to WASM, already a lot faster!!!! 10 months ago
			`int phead = 0;`
			`int ihead = 0;`

			`for (int i = 0; i < clipped_count; ++i) {`
			`int stroke_index = clipped_indices[i];`

			`// TODO: convert to a proper CSR, save half the memory`
			`int base_stroke = stroke_coords_from[stroke_index];`
			`int from = segments_from[i];`
			`int to = segments_from[i + 1];`

			`for (int j = from; j < to; ++j) {`
			`int point_index = segments[j];`
AoS -> SoA for point coordinates 8 months ago			`float x = xs[base_stroke + point_index];`
			`float y = ys[base_stroke + point_index];`
Move point writes to WASM, already a lot faster!!!! 10 months ago
			`points[phead++] = x;`
			`points[phead++] = y;`

AoS -> SoA for point coordinates 8 months ago			`pressures_res[ihead] = pressures[base_stroke + point_index];`
First working draft of pressure hanlding 10 months ago
Move point writes to WASM, already a lot faster!!!! 10 months ago			`if (j != to - 1) {`
			`ids[ihead++] = stroke_index;`
			`} else {`
			`ids[ihead++] = stroke_index \| (1 << 31);`
			`}`
			`}`
			`}`
Redraw HTML on canvas move, first draft of wasm LOD core 10 months ago
Multithreading works! Kinda sorta (slows down in chrome, out of bounds accesses on phone) 8 months ago			`result_buffer[0] = output;`
			`result_count[0] = segments_head;`
			`}`

			`// NOT thread-safe, only call from one thread`
			`char *`
			`merge_results(int segment_counts, char *buffers, int nthreads)`
			`{`
			`int total_segments = 0;`

			`for (int i = 0; i < nthreads; ++i) {`
			`total_segments += segment_counts[i];`
			`}`

			`char merged = alloc_dynamic(total_segments (3 * 4 + 1));`

			`float points = (float ) merged;`
			`int ids = (int ) (merged + total_segments * 4 * 2);`
			`unsigned char pressures = (unsigned char ) (merged + total_segments * 4 * 3);`

			`for (int i = 0; i < nthreads; ++i) {`
			`int segments = segment_counts[i];`
			`if (segments > 0) {`
			`__builtin_memcpy(points, buffers[i], segments * 4 * 2);`
			`__builtin_memcpy(ids, buffers[i] + segments * 4 * 2, segments * 4);`
			`__builtin_memcpy(pressures, buffers[i] + segments * 4 * 3, segments);`

			`points += segments * 2;`
			`ids += segments;`
			`pressures += segments;`
			`}`
			`}`

			`segment_counts[0] = total_segments;`

			`return(merged);`
Successfull sum function in wasm :D 10 months ago			`}`