|
|
|
#include <wasm_simd128.h>
|
|
|
|
|
|
|
|
// Symbol defined outside this file; only its address is used, as the start of
// the region handed out by alloc_static() and alloc_dynamic().
// NOTE(review): presumably the heap-base symbol supplied by the wasm linker — confirm.
extern char __heap_base;
|
|
|
|
|
|
|
|
// Number of bytes handed out so far by alloc_static(); reset by free_static().
static int allocated_static;
|
|
|
|
// Number of bytes handed out so far by alloc_dynamic() (updated atomically there);
// reset by free_dynamic(). Dynamic allocations are placed after the static bytes.
static int allocated_dynamic;
|
|
|
|
|
|
|
|
// Store `sp` into the module-level `__stack_pointer` global.
// The asm declares the global as an i32, pushes the argument and writes it.
// NOTE(review): presumably used to give each worker thread its own stack — confirm with callers.
void
set_sp(char *sp)
{
    __asm__ __volatile__(
        ".globaltype __stack_pointer, i32\n"
        "local.get %0\n"
        "global.set __stack_pointer\n"
        : /* no outputs */
        : "r"(sp)
    );
}
|
|
|
|
|
|
|
|
void
|
|
|
|
free_static(void)
|
|
|
|
{
|
|
|
|
allocated_static = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
free_dynamic(void)
|
|
|
|
{
|
|
|
|
allocated_dynamic = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void *
|
|
|
|
alloc_static(int size)
|
|
|
|
{
|
|
|
|
// This IS NOT thread-safe
|
|
|
|
void *result = &__heap_base + allocated_static;
|
|
|
|
allocated_static += size;
|
|
|
|
return(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Round `value` up to the next multiple of `multiple`.
// `multiple` must be a power of two: the low bits are cleared with a mask.
static int
round_to_pow2(int value, int multiple)
{
    int low_bits = multiple - 1;

    return (value + low_bits) & ~low_bits;
}
|
|
|
|
|
|
|
|
void *
|
|
|
|
alloc_dynamic(int size)
|
|
|
|
{
|
|
|
|
// Very ad-van-ced thread-safe allocator
|
|
|
|
// CAN be called from multiple threads
|
|
|
|
size = round_to_pow2(size, 4);
|
|
|
|
int original_allocated_dynamic = __atomic_fetch_add(&allocated_dynamic, size, __ATOMIC_SEQ_CST);
|
|
|
|
void *result = &__heap_base + allocated_static + original_allocated_dynamic;
|
|
|
|
return(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coords_from,
|
|
|
|
int segment_start, int segment_end)
|
|
|
|
{
|
|
|
|
int result = -1;
|
|
|
|
|
|
|
|
if (segment_start == segment_end) {
|
|
|
|
return(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
float EPS = 0.125f / zoom * 255.0f;
|
|
|
|
float max_dist = 0.0f;
|
|
|
|
|
|
|
|
float ax = xs[coords_from + segment_start];
|
|
|
|
float ay = ys[coords_from + segment_start];
|
|
|
|
float bx = xs[coords_from + segment_end];
|
|
|
|
float by = ys[coords_from + segment_end];
|
|
|
|
|
|
|
|
unsigned char ap = pressures[coords_from / 2 + segment_start];
|
|
|
|
unsigned char bp = pressures[coords_from / 2 + segment_end];
|
|
|
|
|
|
|
|
float dx = bx - ax;
|
|
|
|
float dy = by - ay;
|
|
|
|
|
|
|
|
float dist_ab = __builtin_sqrtf(dx * dx + dy * dy);
|
|
|
|
float dir_nx = dy / dist_ab * 255.0f;
|
|
|
|
float dir_ny = -dx / dist_ab * 255.0f;
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
// Scalar version preserved for reference
|
|
|
|
|
|
|
|
for (int i = segment_start + 1; i < segment_end; ++i) {
|
|
|
|
float px = xs[coords_from + i];
|
|
|
|
float py = ys[coords_from + i];
|
|
|
|
|
|
|
|
unsigned char pp = pressures[coords_from + i];
|
|
|
|
|
|
|
|
float apx = px - ax;
|
|
|
|
float apy = py - ay;
|
|
|
|
|
|
|
|
float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)
|
|
|
|
+ __builtin_abs(pp - ap) + __builtin_abs(pp - bp);
|
|
|
|
|
|
|
|
if (dist > EPS && dist > max_dist) {
|
|
|
|
result = i;
|
|
|
|
max_dist = dist;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
v128_t ax_x4 = wasm_f32x4_splat(ax);
|
|
|
|
v128_t ay_x4 = wasm_f32x4_splat(ay);
|
|
|
|
v128_t ap_x4 = wasm_f32x4_splat(ap);
|
|
|
|
v128_t bp_x4 = wasm_f32x4_splat(bp);
|
|
|
|
v128_t dir_nx_x4 = wasm_f32x4_splat(dir_nx);
|
|
|
|
v128_t dir_ny_x4 = wasm_f32x4_splat(dir_ny);
|
|
|
|
|
|
|
|
v128_t index_x4 = wasm_u32x4_make(segment_start + 1, segment_start + 2, segment_start + 3, segment_start + 4);
|
|
|
|
v128_t four_x4 = wasm_u32x4_const_splat(4);
|
|
|
|
v128_t max_dist_x4 = wasm_f32x4_splat(EPS);
|
|
|
|
v128_t max_index_x4 = wasm_u32x4_const_splat(-1);
|
|
|
|
|
|
|
|
for (int i = segment_start + 1; i < segment_end - 3; i += 4) {
|
|
|
|
v128_t px_x4 = wasm_v128_load(xs + coords_from + i);
|
|
|
|
v128_t py_x4 = wasm_v128_load(ys + coords_from + i);
|
|
|
|
|
|
|
|
v128_t pp_x4 = wasm_f32x4_make(
|
|
|
|
pressures[coords_from / 2 + i + 0],
|
|
|
|
pressures[coords_from / 2 + i + 1],
|
|
|
|
pressures[coords_from / 2 + i + 2],
|
|
|
|
pressures[coords_from / 2 + i + 3]
|
|
|
|
);
|
|
|
|
|
|
|
|
v128_t apx_x4 = wasm_f32x4_sub(px_x4, ax_x4);
|
|
|
|
v128_t apy_x4 = wasm_f32x4_sub(py_x4, ay_x4);
|
|
|
|
|
|
|
|
v128_t dist_x4 = wasm_f32x4_add(
|
|
|
|
wasm_f32x4_add(
|
|
|
|
wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, ap_x4)),
|
|
|
|
wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, bp_x4))
|
|
|
|
),
|
|
|
|
wasm_f32x4_abs(
|
|
|
|
wasm_f32x4_add(
|
|
|
|
wasm_f32x4_mul(apx_x4, dir_nx_x4),
|
|
|
|
wasm_f32x4_mul(apy_x4, dir_ny_x4)
|
|
|
|
)
|
|
|
|
)
|
|
|
|
);
|
|
|
|
|
|
|
|
v128_t mask = wasm_f32x4_gt(dist_x4, max_dist_x4);
|
|
|
|
|
|
|
|
max_index_x4 = wasm_v128_bitselect(index_x4, max_index_x4, mask);
|
|
|
|
max_dist_x4 = wasm_v128_bitselect(dist_x4, max_dist_x4, mask);
|
|
|
|
|
|
|
|
index_x4 = wasm_i32x4_add(index_x4, four_x4);
|
|
|
|
}
|
|
|
|
|
|
|
|
int indices[4];
|
|
|
|
float values[4];
|
|
|
|
|
|
|
|
wasm_v128_store(indices, max_index_x4);
|
|
|
|
wasm_v128_store(values, max_dist_x4);
|
|
|
|
|
|
|
|
for (int i = 0; i < 4; ++i) {
|
|
|
|
if (indices[i] != -1) {
|
|
|
|
if (values[i] > max_dist) {
|
|
|
|
result = indices[i];
|
|
|
|
max_dist = values[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (max_dist == EPS) {
|
|
|
|
max_dist = 0.0f;
|
|
|
|
result = -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int remainder = (segment_end - segment_start - 1) % 4;
|
|
|
|
|
|
|
|
for (int i = segment_end - remainder; i < segment_end; ++i) {
|
|
|
|
float px = xs[coords_from + i];
|
|
|
|
float py = ys[coords_from + i];
|
|
|
|
|
|
|
|
unsigned char pp = pressures[coords_from + i];
|
|
|
|
|
|
|
|
float apx = px - ax;
|
|
|
|
float apy = py - ay;
|
|
|
|
|
|
|
|
float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)
|
|
|
|
+ __builtin_abs(pp - ap) + __builtin_abs(pp - bp);
|
|
|
|
|
|
|
|
if (dist > EPS && dist > max_dist) {
|
|
|
|
result = i;
|
|
|
|
max_dist = dist;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Simplify a set of strokes for the current zoom level and pack the result
// into one dynamically allocated buffer.
//
// clipped_indices     indices of the strokes to process
// clipped_count       number of entries in clipped_indices
// zoom                current zoom, forwarded to rdp_find_max()
// stroke_coords_from  per-stroke offsets: stroke s owns points
//                     [stroke_coords_from[s], stroke_coords_from[s + 1])
// width               per-stroke width (only read by the unfinished LOD/batch code)
// xs, ys, pressures   per-point arrays
// result_buffer[0]    receives the output buffer (left untouched when there
//                     is nothing to process)
// result_count[0]     receives the number of emitted points N
//
// Output layout, N = result_count[0]:
//   N * 2 floats  x/y pairs
//   N ints        stroke index, bit 31 set on the last point of each stroke
//   N bytes       pressures
//   followed by clipped_count * 4 bytes reserved for batches (not written yet)
//
// CAN be called from multiple threads: all memory comes from alloc_dynamic().
void
do_lod(int *clipped_indices, int clipped_count, float zoom,
       int *stroke_coords_from,
       int *width,
       float *xs,
       float *ys,
       unsigned char *pressures,
       char **result_buffer,
       int *result_count)
{
    if (clipped_count <= 0) {
        result_count[0] = 0;
        return;
    }

    // Upper bound for the number of point indices written to `segments`.
    // A stroke emits its first and last point plus distinct interior points,
    // i.e. at most point_count entries, but never fewer than two: a one-point
    // stroke emits its only point twice. Counting it as one entry would
    // overflow the buffer into a neighbouring allocation.
    int segments_capacity = 0;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];
        int point_count = stroke_coords_from[stroke_index + 1] - stroke_coords_from[stroke_index];

        if (point_count >= 2) {
            segments_capacity += point_count;
        } else if (point_count == 1) {
            segments_capacity += 2;
        }
    }

    int *segments_from = alloc_dynamic((clipped_count + 1) * 4);
    int *segments = alloc_dynamic(segments_capacity * 4); // TODO: this is a very conservative estimate, we can lower memory usage if we get this tighter

    int segments_head = 0;
    int stack[4096]; // TODO: what's a reasonable max size for this?
    const int stack_capacity = (int) (sizeof(stack) / sizeof(stack[0]));

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];

        // TODO: convert to a proper CSR, save half the memory
        int coords_from = stroke_coords_from[stroke_index];
        int coords_to = stroke_coords_from[stroke_index + 1];

        int point_count = coords_to - coords_from;

        // Basic CSR crap
        segments_from[i] = segments_head;

        if (point_count <= 0) {
            // Empty stroke: emit nothing instead of reading the points of a
            // neighbouring stroke at index -1
            continue;
        }

        int stack_head = 0;

        segments[segments_head++] = 0;

        // Work items are (type, start, end) triples.
        // type 0: simplify the span [start, end]; type 1: emit the point `start`.
        stack[stack_head++] = 0;
        stack[stack_head++] = 0;
        stack[stack_head++] = point_count - 1;

        while (stack_head > 0) {
            int end = stack[--stack_head];
            int start = stack[--stack_head];
            int type = stack[--stack_head];

            if (type == 1) {
                segments[segments_head++] = start;
                continue;
            }

            int max = rdp_find_max(xs, ys, pressures, zoom, coords_from, start, end);

            if (max == -1) {
                continue;
            }

            if (stack_head + 9 > stack_capacity) {
                // No room for three more work items: keep the split point but
                // stop subdividing this span rather than overrun the stack
                segments[segments_head++] = max;
                continue;
            }

            // Pushed in reverse so that the left half is processed first, then
            // the split point is emitted, then the right half: indices come
            // out in ascending order
            stack[stack_head++] = 0;
            stack[stack_head++] = max;
            stack[stack_head++] = end;

            stack[stack_head++] = 1;
            stack[stack_head++] = max;
            stack[stack_head++] = -1;

            stack[stack_head++] = 0;
            stack[stack_head++] = start;
            stack[stack_head++] = max;
        }

        segments[segments_head++] = point_count - 1;
    }

    segments_from[clipped_count] = segments_head;

    // Write actual coordinates (points) and stroke ids
    // Do this in one allocation so that they're not interleaved between threads
    char *output = alloc_dynamic(segments_head * (3 * 4 + 1) + clipped_count * 4);
    float *points = (float *) output;
    int *ids = (int *) (output + segments_head * 4 * 2);
    unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3);
    unsigned int *batches = (unsigned int *) (output + segments_head * (4 * 3 + 1));

    int phead = 0;
    int ihead = 0;
    float sqrt_zoom = __builtin_sqrtf(zoom);
    int last_lod = -1;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];

        int base_stroke = stroke_coords_from[stroke_index];
        int from = segments_from[i];
        int to = segments_from[i + 1];

        for (int j = from; j < to; ++j) {
            int point_index = segments[j];
            float x = xs[base_stroke + point_index];
            float y = ys[base_stroke + point_index];

            points[phead++] = x;
            points[phead++] = y;

            pressures_res[ihead] = pressures[base_stroke + point_index];

            if (j != to - 1) {
                ids[ihead++] = stroke_index;
            } else {
                // Flag the last point of the stroke in bit 31. The bit is set
                // on an unsigned value: `1 << 31` on a signed int is undefined.
                ids[ihead++] = (int) ((unsigned int) stroke_index | 0x80000000u);
            }
        }

        // Compute recommended LOD level, add to current batch or start new batch
        // (work in progress: the value is computed but not consumed yet)
        float sqrt_width = __builtin_sqrtf(width[stroke_index]); // TODO: pass in stroke width
        int lod = __builtin_round(sqrt_zoom * sqrt_width * 0.3333f);

#if 0
        if (__builtin_abs(lod - last_lod) > 2) {
            // Start new batch
        } else {
            // Add to existing batch
        }

        last_lod = lod;
#endif
    }

    result_buffer[0] = output;
    result_count[0] = segments_head;
}
|
|
|
|
|
|
|
|
// NOT thread-safe, only call from one thread
//
// Concatenate the per-thread output buffers into one buffer with the same
// three-section layout: all x/y float pairs, then all ids, then all pressures.
//
// segment_counts  number of points in each thread's buffer; on return
//                 segment_counts[0] holds the total
// buffers         per-thread buffers (points, then ids, then pressures)
// nthreads        number of entries in segment_counts / buffers
//
// Returns the merged buffer, allocated with alloc_dynamic().
char *
merge_results(int *segment_counts, char **buffers, int nthreads)
{
    int total = 0;

    for (int t = 0; t < nthreads; ++t) {
        total += segment_counts[t];
    }

    char *merged = alloc_dynamic(total * (3 * 4 + 1));

    // Byte cursors into the three sections of the merged buffer
    char *points_dst = merged;
    char *ids_dst = merged + total * 4 * 2;
    char *pressures_dst = merged + total * 4 * 3;

    for (int t = 0; t < nthreads; ++t) {
        int count = segment_counts[t];

        if (count <= 0) {
            continue;
        }

        char *src = buffers[t];
        int points_bytes = count * 4 * 2;
        int ids_bytes = count * 4;

        __builtin_memcpy(points_dst, src, points_bytes);
        __builtin_memcpy(ids_dst, src + points_bytes, ids_bytes);
        __builtin_memcpy(pressures_dst, src + count * 4 * 3, count);

        points_dst += points_bytes;
        ids_dst += ids_bytes;
        pressures_dst += count;
    }

    segment_counts[0] = total;

    return merged;
}
|