// clang -Oz --target=wasm32 -nostdlib -Wl,--export-all,--no-entry -msimd128 lod.c -o lod.wasm

// Stroke level-of-detail: simplifies polylines with an iterative
// Ramer-Douglas-Peucker pass and packs the surviving points into flat buffers.
// Memory comes from two bump allocators placed one after another on the heap.

#if defined(__wasm_simd128__)
#include <wasm_simd128.h>
#endif

#if defined(__wasm__)
// Symbol provided by wasm-ld; its address is where the heap starts.
extern char __heap_base;
#define LOD_HEAP_BASE (&__heap_base)
#else
// Native builds (unit tests) have no linker-provided heap, use a fixed arena.
static _Alignas(16) char lod_native_heap[1 << 24];
#define LOD_HEAP_BASE (lod_native_heap)
#endif

// TODO: what's a reasonable max size for this?
enum { LOD_STACK_CAPACITY = 4096 };

static int allocated_static;
static int allocated_dynamic;

// Points the wasm stack pointer global at sp. Does nothing on non-wasm builds.
void
set_sp(char *sp)
{
#if defined(__wasm__)
    __asm__ __volatile__(
        ".globaltype __stack_pointer, i32\n"
        "local.get %0\n"
        "global.set __stack_pointer\n"
        :
        : "r"(sp)
    );
#else
    (void) sp;
#endif
}

// Resets the static bump allocator.
void
free_static(void)
{
    allocated_static = 0;
}

// Resets the dynamic bump allocator.
void
free_dynamic(void)
{
    allocated_dynamic = 0;
}

// Bump-allocates size bytes from the static region at the start of the heap.
// Returns a pointer to the start of the new allocation.
void *
alloc_static(int size)
{
    // This IS NOT thread-safe
    void *result = LOD_HEAP_BASE + allocated_static;
    allocated_static += size;
    return(result);
}

// Rounds value up to the next multiple of `multiple`.
// The bit trick is only valid when `multiple` is a power of two.
static int
round_to_pow2(int value, int multiple)
{
    return((value + multiple - 1) & -multiple);
}

// Bump-allocates size bytes (rounded up to a multiple of 4) from the dynamic
// region, which starts right after everything handed out by alloc_static.
void *
alloc_dynamic(int size)
{
    // Very ad-van-ced thread-safe allocator
    // CAN be called from multiple threads
    size = round_to_pow2(size, 4);
    int original_allocated_dynamic = __atomic_fetch_add(&allocated_dynamic, size, __ATOMIC_SEQ_CST);
    void *result = LOD_HEAP_BASE + allocated_static + original_allocated_dynamic;
    return(result);
}

// Square root that does not need libm (the wasm build is -nostdlib).
static float
lod_sqrtf(float value)
{
#if defined(__wasm__)
    return(__builtin_sqrtf(value));
#else
    if (value - value != 0.0f) {
        // Infinity or NaN: pass through unchanged
        return(value);
    }

    if (value <= 0.0f) {
        return(0.0f);
    }

    // Newton iteration starting above the root, so it decreases monotonically
    float root = (value > 1.0f) ? value : 1.0f;
    for (int step = 0; step < 128; ++step) {
        float next = 0.5f * (root + value / root);
        if (next == root) {
            break;
        }
        root = next;
    }

    return(root);
#endif
}

// Finds the point strictly between segment_start and segment_end (indices
// relative to coords_from) that deviates the most from the straight segment
// joining the two endpoints.
//
// The deviation of a point is
//     255 * (perpendicular distance to the segment)
//     + |pressure - start pressure| + |pressure - end pressure|
// and it only counts when it is greater than 0.125 / zoom * 255.
//
// Returns the relative index of that point, or -1 when no point qualifies.
static int
rdp_find_max(const float *xs, const float *ys, const unsigned char *pressures,
             float zoom, int coords_from, int segment_start, int segment_end)
{
    int result = -1;

    if (segment_end - segment_start < 2) {
        // No interior points to look at
        return(result);
    }

    // xs, ys and pressures all hold one entry per point, so the same offset
    // applies to all three (this is also how do_lod reads them).
    // NOTE(review): the endpoint and SIMD pressure reads used to be offset by
    // coords_from / 2 while the scalar loop used coords_from - confirm that
    // the full offset is the intended one.
    const float *sx = xs + coords_from;
    const float *sy = ys + coords_from;
    const unsigned char *sp = pressures + coords_from;

    float EPS = 0.125f / zoom * 255.0f;
    float max_dist = 0.0f;

    float ax = sx[segment_start];
    float ay = sy[segment_start];
    float bx = sx[segment_end];
    float by = sy[segment_end];

    int ap = sp[segment_start];
    int bp = sp[segment_end];

    float dx = bx - ax;
    float dy = by - ay;

    float dist_ab = lod_sqrtf(dx * dx + dy * dy);

    if (!(dist_ab > 0.0f)) {
        // Both endpoints coincide (closed stroke), so the segment has no
        // direction and dividing by its length would yield NaN and reject
        // every point. Measure the distance to the shared endpoint instead.
        for (int i = segment_start + 1; i < segment_end; ++i) {
            float apx = sx[i] - ax;
            float apy = sy[i] - ay;
            int pp = sp[i];

            float dist = lod_sqrtf(apx * apx + apy * apy) * 255.0f
                + __builtin_abs(pp - ap) + __builtin_abs(pp - bp);

            if (dist > EPS && dist > max_dist) {
                result = i;
                max_dist = dist;
            }
        }

        return(result);
    }

    // Normal of the segment, pre-scaled so that distances and pressure
    // differences end up in comparable units
    float dir_nx = dy / dist_ab * 255.0f;
    float dir_ny = -dx / dist_ab * 255.0f;

    int i = segment_start + 1;

#if defined(__wasm_simd128__)
    {
        v128_t ax_x4 = wasm_f32x4_splat(ax);
        v128_t ay_x4 = wasm_f32x4_splat(ay);
        v128_t ap_x4 = wasm_f32x4_splat(ap);
        v128_t bp_x4 = wasm_f32x4_splat(bp);
        v128_t dir_nx_x4 = wasm_f32x4_splat(dir_nx);
        v128_t dir_ny_x4 = wasm_f32x4_splat(dir_ny);
        v128_t index_x4 = wasm_u32x4_make(segment_start + 1, segment_start + 2, segment_start + 3, segment_start + 4);
        v128_t four_x4 = wasm_u32x4_const_splat(4);

        // Every lane tracks its own running maximum, starting at the threshold
        v128_t max_dist_x4 = wasm_f32x4_splat(EPS);
        v128_t max_index_x4 = wasm_u32x4_const_splat(-1);

        for (; i < segment_end - 3; i += 4) {
            v128_t px_x4 = wasm_v128_load(sx + i);
            v128_t py_x4 = wasm_v128_load(sy + i);
            v128_t pp_x4 = wasm_f32x4_make(sp[i + 0], sp[i + 1], sp[i + 2], sp[i + 3]);

            v128_t apx_x4 = wasm_f32x4_sub(px_x4, ax_x4);
            v128_t apy_x4 = wasm_f32x4_sub(py_x4, ay_x4);

            v128_t dist_x4 = wasm_f32x4_add(
                wasm_f32x4_add(
                    wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, ap_x4)),
                    wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, bp_x4))
                ),
                wasm_f32x4_abs(
                    wasm_f32x4_add(
                        wasm_f32x4_mul(apx_x4, dir_nx_x4),
                        wasm_f32x4_mul(apy_x4, dir_ny_x4)
                    )
                )
            );

            v128_t mask = wasm_f32x4_gt(dist_x4, max_dist_x4);

            max_index_x4 = wasm_v128_bitselect(index_x4, max_index_x4, mask);
            max_dist_x4 = wasm_v128_bitselect(dist_x4, max_dist_x4, mask);
            index_x4 = wasm_i32x4_add(index_x4, four_x4);
        }

        int indices[4];
        float values[4];

        wasm_v128_store(indices, max_index_x4);
        wasm_v128_store(values, max_dist_x4);

        // A lane whose index is still -1 never exceeded the threshold
        for (int lane = 0; lane < 4; ++lane) {
            if (indices[lane] != -1 && values[lane] > max_dist) {
                result = indices[lane];
                max_dist = values[lane];
            }
        }
    }
#endif

    // Whatever the SIMD loop did not cover (everything on non-SIMD builds)
    for (; i < segment_end; ++i) {
        float apx = sx[i] - ax;
        float apy = sy[i] - ay;
        int pp = sp[i];

        float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)
            + __builtin_abs(pp - ap) + __builtin_abs(pp - bp);

        if (dist > EPS && dist > max_dist) {
            result = i;
            max_dist = dist;
        }
    }

    return(result);
}

// Simplifies the strokes listed in clipped_indices for the given zoom level.
//
//   clipped_indices / clipped_count: strokes to process
//   stroke_coords_from: points of stroke s are
//       [stroke_coords_from[s], stroke_coords_from[s + 1])
//   xs, ys, pressures: per-point data, indexed by absolute point index
//   result_buffer[0]: receives one alloc_dynamic block holding, for N = result_count[0]:
//       N * 2 floats (x, y), then N ints (stroke ids), then N bytes (pressures)
//   result_count[0]: receives the number of points written
//
// The id of the last point of every stroke has its highest bit set.
// When clipped_count is not positive only result_count[0] is written.
void
do_lod(int *clipped_indices, int clipped_count,
       float zoom,
       int *stroke_coords_from,
       float *xs, float *ys, unsigned char *pressures,
       char **result_buffer, int *result_count)
{
    if (clipped_count <= 0) {
        result_count[0] = 0;
        return;
    }

    // A stroke keeps both of its ends plus a subset of its interior points, so
    // it never produces more entries than it has points. The exception is a
    // single-point stroke, which is written as two identical ends.
    // TODO: this is a very conservative estimate, we can lower memory usage if we get this tighter
    int segment_capacity = 0;
    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];
        int point_count = stroke_coords_from[stroke_index + 1] - stroke_coords_from[stroke_index];
        if (point_count > 0) {
            segment_capacity += (point_count < 2) ? 2 : point_count;
        }
    }

    int *segments_from = alloc_dynamic((clipped_count + 1) * 4);
    int *segments = alloc_dynamic(segment_capacity * 4);

    int segments_head = 0;
    int stack[LOD_STACK_CAPACITY];

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];

        // TODO: convert to a proper CSR, save half the memory
        int coords_from = stroke_coords_from[stroke_index];
        int coords_to = stroke_coords_from[stroke_index + 1];
        int point_count = coords_to - coords_from;

        // Basic CSR crap
        segments_from[i] = segments_head;

        if (point_count <= 0) {
            // Nothing to emit, and indexing point -1 would read out of bounds
            continue;
        }

        int stack_head = 0;

        segments[segments_head++] = 0;

        // Stack entries are (type, start, end) triples.
        // Type 0: simplify the segment [start, end]. Type 1: emit point `start`.
        stack[stack_head++] = 0;
        stack[stack_head++] = 0;
        stack[stack_head++] = point_count - 1;

        while (stack_head > 0) {
            int end = stack[--stack_head];
            int start = stack[--stack_head];
            int type = stack[--stack_head];

            if (type == 1) {
                segments[segments_head++] = start;
                continue;
            }

            int max = rdp_find_max(xs, ys, pressures, zoom, coords_from, start, end);
            if (max == -1) {
                continue;
            }

            if (stack_head + 9 > LOD_STACK_CAPACITY) {
                // No room left to split further: keep every point of this
                // segment as is instead of writing past the end of the stack
                for (int p = start + 1; p < end; ++p) {
                    segments[segments_head++] = p;
                }
                continue;
            }

            // Pushed in reverse so that the left half is handled first, then
            // the split point gets emitted, then the right half is handled
            stack[stack_head++] = 0;
            stack[stack_head++] = max;
            stack[stack_head++] = end;

            stack[stack_head++] = 1;
            stack[stack_head++] = max;
            stack[stack_head++] = -1;

            stack[stack_head++] = 0;
            stack[stack_head++] = start;
            stack[stack_head++] = max;
        }

        segments[segments_head++] = point_count - 1;
    }

    segments_from[clipped_count] = segments_head;

    // Write actual coordinates (points) and stroke ids
    // Do this in one allocation so that they're not interleaved between threads
    char *output = alloc_dynamic(segments_head * (3 * 4 + 1));

    float *points = (float *) output;
    int *ids = (int *) (output + segments_head * 4 * 2);
    unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3);

    int phead = 0;
    int ihead = 0;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];

        // TODO: convert to a proper CSR, save half the memory
        int base_stroke = stroke_coords_from[stroke_index];
        int from = segments_from[i];
        int to = segments_from[i + 1];

        for (int j = from; j < to; ++j) {
            int point_index = segments[j];

            points[phead++] = xs[base_stroke + point_index];
            points[phead++] = ys[base_stroke + point_index];

            pressures_res[ihead] = pressures[base_stroke + point_index];

            if (j != to - 1) {
                ids[ihead++] = stroke_index;
            } else {
                // Highest bit marks the last point of the stroke. Set it in
                // unsigned arithmetic: 1 << 31 overflows a signed int.
                ids[ihead++] = (int) ((unsigned int) stroke_index | 0x80000000u);
            }
        }
    }

    result_buffer[0] = output;
    result_count[0] = segments_head;
}

// Concatenates the per-thread buffers produced by do_lod into a single buffer
// with the same layout (all points, then all ids, then all pressures).
//
//   segment_counts[i]: number of points in buffers[i]
//   nthreads: number of entries in both arrays
//
// Returns the merged buffer and stores the total number of points in
// segment_counts[0].
//
// NOT thread-safe, only call from one thread
char *
merge_results(int *segment_counts, char **buffers, int nthreads)
{
    int total_segments = 0;

    for (int i = 0; i < nthreads; ++i) {
        total_segments += segment_counts[i];
    }

    char *merged = alloc_dynamic(total_segments * (3 * 4 + 1));

    float *points = (float *) merged;
    int *ids = (int *) (merged + total_segments * 4 * 2);
    unsigned char *pressures = (unsigned char *) (merged + total_segments * 4 * 3);

    for (int i = 0; i < nthreads; ++i) {
        int segments = segment_counts[i];
        if (segments > 0) {
            __builtin_memcpy(points, buffers[i], segments * 4 * 2);
            __builtin_memcpy(ids, buffers[i] + segments * 4 * 2, segments * 4);
            __builtin_memcpy(pressures, buffers[i] + segments * 4 * 3, segments);

            points += segments * 2;
            ids += segments;
            pressures += segments;
        }
    }

    segment_counts[0] = total_segments;

    return(merged);
}