// clang -g -Wall -Wextra -O3 -Wl,--export-all,--no-entry --target=wasm32 -Xclang -target-feature -Xclang +simd128 lod.c -nostdlib -o lod.wasm

#if defined(__wasm__)
#include <wasm_simd128.h> // only used by the disabled (#if 0) SIMD sketch in rdp_find_max
#endif

/*
 * Bump arena.
 *
 * On wasm the arena starts at __heap_base, a symbol supplied by the linker.
 * Any other target has no such symbol, so host builds (unit tests, tooling)
 * get a fixed-size static arena instead.  Neither variant checks bounds.
 */
#if defined(__wasm__)
extern char __heap_base;
#define LOD_ARENA_BASE (&__heap_base)
#else
enum { LOD_HOST_ARENA_BYTES = 1 << 20 };
static _Alignas(16) char lod_host_arena[LOD_HOST_ARENA_BYTES];
#define LOD_ARENA_BASE (lod_host_arena)
#endif

enum {
    LOD_STACK_CAPACITY = 4096, /* ints available on do_lod's work stack */
    LOD_FRAME_INTS     = 3,    /* one work item = (type, start, end)    */
    LOD_TASK_SPLIT     = 0,    /* try to subdivide [start, end]         */
    LOD_TASK_EMIT      = 1     /* append `start` to the kept points     */
};

/* Bytes handed out so far from the static and the dynamic region. */
static int allocated_static;
static int allocated_dynamic;

/* Forgets every static allocation; memory is not cleared. */
void
free_static(void)
{
    allocated_static = 0;
}

/* Forgets every dynamic allocation; memory is not cleared. */
void
free_dynamic(void)
{
    allocated_dynamic = 0;
}

/*
 * Reserves `size` bytes at the end of the static region and returns their
 * address.  The dynamic region begins where the static one ends, so growing
 * the static region while dynamic allocations are live makes the two overlap.
 * No alignment is applied and no bounds are checked.
 */
void *
alloc_static(int size)
{
    void *result = LOD_ARENA_BASE + allocated_static;
    allocated_static += size;
    return(result);
}

/*
 * Reserves `size` bytes at the end of the dynamic region (which follows the
 * static region) and returns their address.  No alignment is applied and no
 * bounds are checked.
 */
void *
alloc_dynamic(int size)
{
    void *result = LOD_ARENA_BASE + allocated_static + allocated_dynamic;
    allocated_dynamic += size;
    return(result);
}

/*
 * One Ramer-Douglas-Peucker step: among the points strictly between
 * segment_start and segment_end (indices relative to coords_from), returns
 * the index of the one that deviates most from the chord start -> end, or -1
 * when no point deviates by more than the zoom-dependent tolerance.
 *
 * The deviation is the perpendicular distance to the chord scaled by 255,
 * plus the absolute pressure difference to both end points.  On ties the
 * lowest index wins.
 */
static int
rdp_find_max(const float *xs, const float *ys, const unsigned char *pressures,
             float zoom, int coords_from, int segment_start, int segment_end)
{
    const float EPS = 0.125f / zoom * 255.0f;

    int result = -1;
    float max_dist = 0.0f;

    const float ax = xs[coords_from + segment_start];
    const float ay = ys[coords_from + segment_start];
    const float bx = xs[coords_from + segment_end];
    const float by = ys[coords_from + segment_end];

    /* Pressures are stored one per point, exactly like xs/ys, so they use the
     * same index.  (This used to read coords_from / 2 + ..., which picked the
     * end-point pressures from unrelated points.) */
    const unsigned char ap = pressures[coords_from + segment_start];
    const unsigned char bp = pressures[coords_from + segment_end];

    const float dx = bx - ax;
    const float dy = by - ay;
    const float dist_ab = __builtin_sqrtf(dx * dx + dy * dy);

    /* A zero-length chord (e.g. a stroke that closes on itself) has no
     * direction; dividing by it would turn every distance into NaN and hide
     * all interior points.  Measure the distance to the shared end point
     * instead.  Written as !(x > 0) so that a NaN length is also caught. */
    const int degenerate = !(dist_ab > 0.0f);

    float dir_nx = 0.0f;
    float dir_ny = 0.0f;
    if (!degenerate) {
        /* Chord normal, pre-scaled so geometry and pressure share a range. */
        dir_nx = dy / dist_ab * 255.0f;
        dir_ny = -dx / dist_ab * 255.0f;
    }

    /* NOTE: the two "#if 0" regions below are an unfinished SIMD sketch.  They
     * do not compile as written (several names they use are not defined in
     * this function) and are kept only as a starting point. */
#if 0
    v128_t scale_255 = wasm_f32x4_splat(1.0f / 255.0f);
    v128_t EPSs = wasm_f32x4_splat(EPS);
#endif

    for (int i = segment_start + 1; i < segment_end; ++i) {
#if 0
        v128_t pxs = wasm_v128_load(coordinates_x + coords_from + i);
        v128_t pxs = wasm_v128_load(coordinates_y + coords_from + i);
        v128_t pps = wasm_v128_load(pressures + coords_from + i);
        v128_t apxs = wasm_f32x4_sub(pxs, axs);
        v128_t apys = wasm_f32x4_sub(pys, ays);

        v128_t dists = wasm_f32x4_add(
            wasm_f32x4_add(
                wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, aps)), scale_255),
                wasm_f32x4_mul(wasm_f32x4_abs(wasm_f32x4_sub(pps, bps)), scale_255)
            ),
            wasm_f32x4_abs(
                wasm_f32x4_add(
                    wasm_f32x4_mul(apxs, dir_nxs),
                    wasm_f32x4_mul(apys, dir_nys)
                )
            )
        );

        v128_t dist_mask = wasm_f32x4_gt(dists, EPSs);
        v128_t max_mask = wasm_f32x4_gt(dists, max_dists);
        v128_t final_mask = wasm_v128_and(dist_mask, max_mask);

        if (!wasm_v128_any_true(final_mask)) {
            // fast path? hopefully?
            continue;
        }

        // Places max(0, 2) and max(1, 3) into lanes (0, 1)
        v128_t max_02_13 = wasm_f32x4_max(
            dists,
            wasm_i32x4_shuffle(dists, dists, 2, 3, 2, 3)
        );

        // Places max(max(0, 2), max(1, 3)) into lane 0
        v128_t max_0123 = wasm_f32x4_max(
            max_02_13,
            wasm_i32x4_shuffle(max_02_13, max_02_13, 1, 1, 1, 1)
        );

        float final_max = wasm_f32x4_extract_lane(max_0123, 0);
#endif

        const float px = xs[coords_from + i];
        const float py = ys[coords_from + i];
        const unsigned char pp = pressures[coords_from + i];

        const float apx = px - ax;
        const float apy = py - ay;

        float geometric;
        if (degenerate) {
            geometric = __builtin_sqrtf(apx * apx + apy * apy) * 255.0f;
        } else {
            geometric = __builtin_fabsf(apx * dir_nx + apy * dir_ny);
        }

        const float dist = geometric
                         + __builtin_abs(pp - ap)
                         + __builtin_abs(pp - bp);

        if (dist > EPS && dist > max_dist) {
            result = i;
            max_dist = dist;
        }
    }

    return(result);
}

/* Pushes one (type, start, end) work item and returns the new stack head. */
static int
lod_push(int *stack, int head, int type, int start, int end)
{
    stack[head++] = type;
    stack[head++] = start;
    stack[head++] = end;
    return(head);
}

/*
 * Simplifies the given strokes for the current zoom level and writes the
 * surviving points into freshly allocated dynamic-arena buffers.
 *
 *   clipped_indices    stroke ids to process (clipped_count of them)
 *   zoom               current zoom; the tolerance shrinks as zoom grows
 *   stroke_coords_from stroke s owns points
 *                      [stroke_coords_from[s], stroke_coords_from[s + 1])
 *   line_threshold     per stroke: below this zoom the stroke is emitted as
 *                      just its two end points.  Updated in place: raised to
 *                      `zoom` when a full pass keeps no interior point.
 *   xs, ys, pressures  per-point data, indexed by absolute point index
 *   coordinates_count  capacity (in ints) reserved for the kept-point list
 *
 * Returns the total number of points written (0 when clipped_count is 0, in
 * which case nothing is allocated).
 *
 * Buffers are allocated with alloc_dynamic in this order:
 *   int   segments_from[clipped_count + 1]   start of each stroke's points
 *   int   segments[coordinates_count]        kept point indices (per stroke)
 *   float points[2 * result]                 x, y pairs
 *   int   ids[result]                        stroke id; the last point of
 *                                            each stroke has bit 31 set
 *   uchar pressures_res[result]
 * Only the count is returned, so the caller presumably recomputes these
 * offsets -- NOTE(review): confirm before changing any size or the order.
 * This function never calls free_dynamic itself.
 *
 * NOTE(review): every stroke contributes at least two entries to `segments`,
 * so strokes with fewer than two points can need more than coordinates_count
 * entries in total (and an empty stroke yields index -1).  Confirm callers
 * never pass such strokes; the buffer size is left as is because of the
 * layout dependency above.
 */
int
do_lod(int *clipped_indices, int clipped_count, float zoom,
       int *stroke_coords_from, float *line_threshold,
       float *xs, float *ys, unsigned char *pressures,
       int coordinates_count)
{
    if (clipped_count == 0) {
        return(0);
    }

    int *segments_from = alloc_dynamic((clipped_count + 1) * (int) sizeof(int));
    int *segments = alloc_dynamic(coordinates_count * (int) sizeof(int));

    int segments_head = 0;
    int stack[LOD_STACK_CAPACITY];

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];

        // TODO: convert to a proper CSR, save half the memory
        int coords_from = stroke_coords_from[stroke_index];
        int coords_to = stroke_coords_from[stroke_index + 1];

        int point_count = coords_to - coords_from;

        // Basic CSR crap
        segments_from[i] = segments_head;

        if (zoom < line_threshold[stroke_index]) {
            // Fast path: already known to collapse to a single line segment
            segments[segments_head++] = 0;
            segments[segments_head++] = point_count - 1;
            continue;
        }

        int segment_count = 2;
        int stack_head = 0;

        segments[segments_head++] = 0;
        stack_head = lod_push(stack, stack_head, LOD_TASK_SPLIT, 0, point_count - 1);

        /* Iterative depth-first subdivision.  Work items are popped in
         * left-to-right order so kept points come out sorted by index. */
        while (stack_head > 0) {
            int end = stack[--stack_head];
            int start = stack[--stack_head];
            int type = stack[--stack_head];

            if (type == LOD_TASK_EMIT) {
                segments[segments_head++] = start;
                continue;
            }

            /* A split pushes three frames.  If they would not fit, leave this
             * segment unsplit rather than write past the end of the stack:
             * very deep subdivisions degrade to a coarser result instead of
             * corrupting memory. */
            if (stack_head + 3 * LOD_FRAME_INTS > LOD_STACK_CAPACITY) {
                continue;
            }

            int max = rdp_find_max(xs, ys, pressures, zoom, coords_from, start, end);
            if (max == -1) {
                continue;
            }

            segment_count += 1;

            /* Pushed in reverse of the order they must run:
             * left half, then emit the split point, then right half. */
            stack_head = lod_push(stack, stack_head, LOD_TASK_SPLIT, max, end);
            stack_head = lod_push(stack, stack_head, LOD_TASK_EMIT, max, -1);
            stack_head = lod_push(stack, stack_head, LOD_TASK_SPLIT, start, max);
        }

        segments[segments_head++] = point_count - 1;

        /* Nothing but the end points survived: remember that this stroke is
         * a plain line up to (at least) this zoom. */
        if (segment_count == 2 && zoom > line_threshold[stroke_index]) {
            line_threshold[stroke_index] = zoom;
        }
    }

    segments_from[clipped_count] = segments_head;

    // Write actual coordinates (points) and stroke ids
    float *points = alloc_dynamic(segments_head * 2 * (int) sizeof(float));
    int *ids = alloc_dynamic(segments_head * (int) sizeof(int));
    unsigned char *pressures_res = alloc_dynamic(segments_head);

    int phead = 0;
    int ihead = 0;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];

        // TODO: convert to a proper CSR, save half the memory
        int base_stroke = stroke_coords_from[stroke_index];
        int from = segments_from[i];
        int to = segments_from[i + 1];

        for (int j = from; j < to; ++j) {
            int point_index = segments[j];

            points[phead++] = xs[base_stroke + point_index];
            points[phead++] = ys[base_stroke + point_index];

            pressures_res[ihead] = pressures[base_stroke + point_index];

            if (j != to - 1) {
                ids[ihead++] = stroke_index;
            } else {
                /* Flag the stroke's last point in the top bit.  Done in
                 * unsigned arithmetic: `1 << 31` overflows a signed int. */
                ids[ihead++] = (int) ((unsigned) stroke_index | 0x80000000u);
            }
        }
    }

    return(segments_head);
}