# include <wasm_simd128.h>
// Only the ADDRESS of this symbol is used: it is the base that alloc_static() and
// alloc_dynamic() add their offsets to. Defined outside this file (presumably by
// the linker as the start of the heap -- confirm against the build setup).
extern char __heap_base ;
// Number of bytes handed out by alloc_static() so far; reset by free_static()
static int allocated_static ;
// Number of bytes handed out by alloc_dynamic() so far (the dynamic region starts
// right after the static one); reset by free_dynamic()
static int allocated_dynamic ;
// Overwrites the WebAssembly `__stack_pointer` global with `sp`.
// The asm declares `__stack_pointer` as an i32 global, pushes the input operand
// (%0 = sp) and stores it into that global. Nothing is saved or restored here,
// so the previous stack pointer value is lost.
void
set_sp ( char * sp )
{
__asm__ __volatile__ (
// Declare the global's type so the assembler accepts the global.set below
" .globaltype __stack_pointer, i32 \n "
// Push the requested stack pointer value...
" local.get %0 \n "
// ...and make it the current stack pointer
" global.set __stack_pointer \n "
// No outputs; `sp` is the only input operand
: : " r " ( sp )
) ;
}
// Releases everything handed out by alloc_static() at once by rewinding the
// static bump offset to zero. Pointers returned earlier will be reused by
// subsequent alloc_static() calls.
void
free_static(void)
{
    allocated_static = 0;
}
// Releases everything handed out by alloc_dynamic() at once by rewinding the
// dynamic bump offset to zero. The store is a plain (non-atomic) write.
void
free_dynamic(void)
{
    allocated_dynamic = 0;
}
// Bump-allocates `size` bytes from the static region that starts at __heap_base.
// Returns the address of the first byte of the new block.
// The offset is read and updated without any synchronisation: NOT thread-safe.
// The size is used as-is (no rounding), so the next block starts exactly
// `size` bytes after this one.
void *
alloc_static(int size)
{
    char *block = &__heap_base + allocated_static;

    allocated_static = allocated_static + size;

    return block;
}
// Rounds `value` up to the nearest multiple of `multiple`.
// `multiple` must be a power of two: the result is computed by adding
// (multiple - 1) and clearing the low bits with a mask.
static int
round_to_pow2(int value, int multiple)
{
    int low_bits = multiple - 1;
    int bumped = value + low_bits;

    return bumped & ~low_bits;
}
// Bump-allocates `size` bytes (rounded up to a multiple of 4) from the dynamic
// region, which begins right after the bytes currently taken by alloc_static().
// The offset is reserved with a sequentially-consistent atomic fetch-add, so
// concurrent callers receive disjoint ranges and the function CAN be called
// from multiple threads.
void *
alloc_dynamic(int size)
{
    int rounded_size = round_to_pow2(size, 4);
    int offset = __atomic_fetch_add(&allocated_dynamic, rounded_size, __ATOMIC_SEQ_CST);
    char *dynamic_base = &__heap_base + allocated_static;

    return dynamic_base + offset;
}
// One step of Ramer-Douglas-Peucker simplification: among the points strictly
// between `segment_start` and `segment_end`, finds the one that deviates the
// most from the straight segment between those two end points.
//
// The deviation of a point P from segment A-B is
//     |perpendicular distance of P to line AB| * 255 + |pp - ap| + |pp - bp|
// where pp/ap/bp are the pressure bytes of P, A and B. A point only qualifies
// when its deviation is strictly greater than EPS = 0.125 / zoom * 255.
//
// xs, ys, pressures  per-point arrays; point `i` of the stroke lives at index
//                    `coords_from + i` in all three
// zoom               current zoom; larger zoom -> smaller tolerance
// coords_from        index of the stroke's first point
// segment_start/end  stroke-relative indices of the segment end points
//
// Returns the stroke-relative index of the worst point, or -1 when no interior
// point exceeds the tolerance (including the empty start == end segment).
//
// Groups of 4 points are processed with WASM SIMD, the leftover (< 4) points by
// the scalar tail loop, which is also the reference form of the metric. On
// exact ties the SIMD reduction may return a different (equally distant) index
// than a purely scalar scan would.
//
// NOTE(review): when A and B have identical coordinates dist_ab is 0, the
// normal becomes NaN, every comparison below is false and -1 is returned, i.e.
// such a segment is never subdivided. Confirm this is acceptable for closed
// strokes.
static int
rdp_find_max(float *xs, float *ys, unsigned char *pressures, float zoom, int coords_from,
             int segment_start, int segment_end)
{
    int result = -1;

    if (segment_start == segment_end) {
        return result;
    }

    // All three arrays are indexed per point, so they share the same stroke
    // offset. (The pressures used to be read at `coords_from / 2 + i` in the
    // SIMD part but at `coords_from + i` in the scalar tail and in do_lod();
    // the per-point index is the one consistent with xs/ys.)
    float *stroke_xs = xs + coords_from;
    float *stroke_ys = ys + coords_from;
    unsigned char *stroke_pressures = pressures + coords_from;

    float eps = 0.125f / zoom * 255.0f;
    float max_dist = 0.0f;

    float ax = stroke_xs[segment_start];
    float ay = stroke_ys[segment_start];
    float bx = stroke_xs[segment_end];
    float by = stroke_ys[segment_end];

    unsigned char ap = stroke_pressures[segment_start];
    unsigned char bp = stroke_pressures[segment_end];

    // Normal of A-B, pre-multiplied by 255 so that the dot product below is
    // the perpendicular distance scaled to the same 0..255 range as pressure
    float dx = bx - ax;
    float dy = by - ay;
    float dist_ab = __builtin_sqrtf(dx * dx + dy * dy);
    float dir_nx = dy / dist_ab * 255.0f;
    float dir_ny = -dx / dist_ab * 255.0f;

    v128_t ax_x4 = wasm_f32x4_splat(ax);
    v128_t ay_x4 = wasm_f32x4_splat(ay);
    v128_t ap_x4 = wasm_f32x4_splat(ap);
    v128_t bp_x4 = wasm_f32x4_splat(bp);
    v128_t dir_nx_x4 = wasm_f32x4_splat(dir_nx);
    v128_t dir_ny_x4 = wasm_f32x4_splat(dir_ny);

    v128_t index_x4 = wasm_u32x4_make(segment_start + 1, segment_start + 2, segment_start + 3, segment_start + 4);
    v128_t four_x4 = wasm_u32x4_const_splat(4);

    // Every lane starts at the tolerance, so only points with dist > EPS are
    // ever recorded; lanes that never see such a point keep index -1
    v128_t max_dist_x4 = wasm_f32x4_splat(eps);
    v128_t max_index_x4 = wasm_u32x4_const_splat(-1);

    // Full groups of 4 interior points
    int i = segment_start + 1;

    for (; i < segment_end - 3; i += 4) {
        v128_t px_x4 = wasm_v128_load(stroke_xs + i);
        v128_t py_x4 = wasm_v128_load(stroke_ys + i);
        v128_t pp_x4 = wasm_f32x4_make(
            stroke_pressures[i + 0],
            stroke_pressures[i + 1],
            stroke_pressures[i + 2],
            stroke_pressures[i + 3]
        );

        v128_t apx_x4 = wasm_f32x4_sub(px_x4, ax_x4);
        v128_t apy_x4 = wasm_f32x4_sub(py_x4, ay_x4);

        v128_t pressure_dist_x4 = wasm_f32x4_add(
            wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, ap_x4)),
            wasm_f32x4_abs(wasm_f32x4_sub(pp_x4, bp_x4))
        );

        v128_t line_dist_x4 = wasm_f32x4_abs(
            wasm_f32x4_add(
                wasm_f32x4_mul(apx_x4, dir_nx_x4),
                wasm_f32x4_mul(apy_x4, dir_ny_x4)
            )
        );

        v128_t dist_x4 = wasm_f32x4_add(pressure_dist_x4, line_dist_x4);

        // Per lane: keep the running maximum and the index it came from
        v128_t mask = wasm_f32x4_gt(dist_x4, max_dist_x4);

        max_index_x4 = wasm_v128_bitselect(index_x4, max_index_x4, mask);
        max_dist_x4 = wasm_v128_bitselect(dist_x4, max_dist_x4, mask);
        index_x4 = wasm_i32x4_add(index_x4, four_x4);
    }

    // Horizontal reduction of the 4 lanes
    int lane_index[4];
    float lane_dist[4];

    wasm_v128_store(lane_index, max_index_x4);
    wasm_v128_store(lane_dist, max_dist_x4);

    for (int lane = 0; lane < 4; ++lane) {
        if (lane_index[lane] != -1 && lane_dist[lane] > max_dist) {
            result = lane_index[lane];
            max_dist = lane_dist[lane];
        }
    }

    // Scalar tail: `i` now points at the first interior point that did not fit
    // into a full group of 4 (at most 3 points remain)
    for (; i < segment_end; ++i) {
        float apx = stroke_xs[i] - ax;
        float apy = stroke_ys[i] - ay;
        unsigned char pp = stroke_pressures[i];

        float dist = __builtin_fabsf(apx * dir_nx + apy * dir_ny)
                   + __builtin_abs(pp - ap) + __builtin_abs(pp - bp);

        if (dist > eps && dist > max_dist) {
            result = i;
            max_dist = dist;
        }
    }

    return result;
}
// Simplifies a list of strokes for the given zoom level and writes the
// surviving points into one freshly allocated output buffer.
//
// clipped_indices     indices of the strokes to process
// clipped_count       number of entries in clipped_indices
// zoom                current zoom, forwarded to rdp_find_max() and used for LOD
// stroke_coords_from  stroke i owns points [stroke_coords_from[i], stroke_coords_from[i + 1])
// width               per-stroke width, used to pick the LOD level
// xs, ys, pressures   per-point input arrays
// result_buffer[0]    receives the output buffer
// result_count[0]     receives the number of emitted points
// result_batch_count[0] receives the number of batches
//
// Output buffer layout for N emitted points (all from ONE alloc_dynamic() call
// so that the parts are not interleaved with other threads' allocations):
//   offset 0         N * 2 floats  x, y
//   offset N * 8     N ints        stroke index; the top bit is set on the last
//                                  point of every stroke
//   offset N * 12    N bytes       pressure
//   offset round_to_pow2(N * 13, 4)
//                    2 ints per batch: (number of points, LOD level)
//
// Every stroke always emits its first and last point (stroke-relative indices
// 0 and point_count - 1) plus whatever rdp_find_max() selects in between.
void
do_lod(int *clipped_indices, int clipped_count, float zoom,
       int *stroke_coords_from,
       int *width,
       float *xs,
       float *ys,
       unsigned char *pressures,
       char **result_buffer,
       int *result_count,
       int *result_batch_count)
{
    enum {
        LOD_STACK_CAPACITY = 4096, // ints available to the explicit work stack
        LOD_SPLIT_INTS = 9         // ints pushed by one subdivision (3 entries of 3 ints)
    };

    if (clipped_count <= 0) {
        // Report BOTH counters as empty: merge_results() sums the batch count
        // of every thread, so it must not be left at a stale value
        result_count[0] = 0;
        result_batch_count[0] = 0;
        return;
    }

    // Upper bound on the number of emitted points. A stroke emits at least 2
    // entries (first + last) even when it has fewer than 2 points, and at most
    // one entry per point otherwise.
    int max_segments = 0;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];
        int point_count = stroke_coords_from[stroke_index + 1] - stroke_coords_from[stroke_index];

        max_segments += (point_count < 2) ? 2 : point_count;
    }

    // CSR-style index: stroke i owns segments[segments_from[i] .. segments_from[i + 1])
    int *segments_from = alloc_dynamic((clipped_count + 1) * 4);
    int *segments = alloc_dynamic(max_segments * 4); // TODO: this is a very conservative estimate, we can lower memory usage if we get this tighter

    int segments_head = 0;
    int stack[LOD_STACK_CAPACITY];

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];

        // TODO: convert to a proper CSR, save half the memory
        int coords_from = stroke_coords_from[stroke_index];
        int coords_to = stroke_coords_from[stroke_index + 1];
        int point_count = coords_to - coords_from;

        segments_from[i] = segments_head;
        segments[segments_head++] = 0;

        // Stack entries are (type, start, end) triples:
        //   type 0 -> subdivide the segment start..end
        //   type 1 -> emit point `start` (end is unused)
        int stack_head = 0;

        stack[stack_head++] = 0;
        stack[stack_head++] = 0;
        stack[stack_head++] = point_count - 1;

        while (stack_head > 0) {
            int end = stack[--stack_head];
            int start = stack[--stack_head];
            int type = stack[--stack_head];

            if (type == 1) {
                segments[segments_head++] = start;
                continue;
            }

            if (stack_head + LOD_SPLIT_INTS > LOD_STACK_CAPACITY) {
                // No room for another subdivision: keep this segment as it is
                // instead of writing past the end of the stack
                continue;
            }

            int split = rdp_find_max(xs, ys, pressures, zoom, coords_from, start, end);

            if (split == -1) {
                continue;
            }

            // Pushed in reverse so that they pop as: left half, emit split
            // point, right half -- which keeps the emitted indices ascending
            stack[stack_head++] = 0;
            stack[stack_head++] = split;
            stack[stack_head++] = end;

            stack[stack_head++] = 1;
            stack[stack_head++] = split;
            stack[stack_head++] = -1;

            stack[stack_head++] = 0;
            stack[stack_head++] = start;
            stack[stack_head++] = split;
        }

        segments[segments_head++] = point_count - 1;
    }

    segments_from[clipped_count] = segments_head;

    // Write actual coordinates (points) and stroke ids
    // Do this in one allocation so that they're not interleaved between threads
    int batches_offset = round_to_pow2(segments_head * (3 * 4 + 1), 4);
    char *output = alloc_dynamic(batches_offset + clipped_count * 4 * 2); // max two ints per stroke for batch info (realistically, much less)

    float *points = (float *) output;
    int *ids = (int *) (output + segments_head * 4 * 2);
    unsigned char *pressures_res = (unsigned char *) (output + segments_head * 4 * 3);
    int *batches = (int *) (output + batches_offset);

    int phead = 0;
    int ihead = 0;

    float sqrt_zoom = __builtin_sqrtf(zoom);
    int last_lod = -100;
    int batch_count = 0;
    int batch_size = 0;

    for (int i = 0; i < clipped_count; ++i) {
        int stroke_index = clipped_indices[i];
        int base_stroke = stroke_coords_from[stroke_index];
        int from = segments_from[i];
        int to = segments_from[i + 1];

        for (int j = from; j < to; ++j) {
            int point_index = segments[j];

            points[phead++] = xs[base_stroke + point_index];
            points[phead++] = ys[base_stroke + point_index];
            pressures_res[ihead] = pressures[base_stroke + point_index];

            if (j != to - 1) {
                ids[ihead++] = stroke_index;
            } else {
                // Top bit marks the last point of the stroke. Built in unsigned
                // arithmetic: `1 << 31` on a signed int is undefined behaviour.
                ids[ihead++] = (int) ((unsigned int) stroke_index | 0x80000000u);
            }
        }

        int segment_count = to - from;

        // Compute recommended LOD level, add to current batch or start new batch
        float sqrt_width = __builtin_sqrtf(width[stroke_index]);
        int lod = __builtin_ceil(sqrt_zoom * sqrt_width * 0.3333f); // TODO: round

        if (lod > 7) {
            lod = 7;
        }

        if (batch_size > 0 && __builtin_abs(lod - last_lod) > 2) {
            // Start new batch; the finished one is tagged with the LOD of its last stroke
            batches[batch_count * 2 + 0] = batch_size;
            batches[batch_count * 2 + 1] = last_lod;
            ++batch_count;
            batch_size = 0;
        }

        batch_size += segment_count;
        last_lod = lod;
    }

    if (batch_size > 0) {
        batches[batch_count * 2 + 0] = batch_size;
        batches[batch_count * 2 + 1] = last_lod;
        ++batch_count;
    }

    result_buffer[0] = output;
    result_count[0] = segments_head;
    result_batch_count[0] = batch_count;
}
// NOT thread-safe, only call from one thread
//
// Concatenates the per-thread output buffers of do_lod() into one buffer that
// uses the same layout (N = total number of points):
//   offset 0         N * 2 floats  x, y
//   offset N * 8     N ints        stroke ids
//   offset N * 12    N bytes       pressure
//   offset round_to_pow2(N * 13, 4)
//                    2 ints per batch: (index of the batch's first point in
//                    the merged buffer, LOD level)
// Note that the first int of a batch changes meaning: the per-thread buffers
// store the batch SIZE there, the merged buffer stores the running OFFSET.
//
// segment_counts  per-thread point counts; [0] is overwritten with the total
// batch_counts    per-thread batch counts; [0] is overwritten with the total
// buffers         per-thread output buffers
// nthreads        number of entries in the three arrays
//
// Threads that report no points are skipped entirely: neither their buffer nor
// their batch count is looked at.
//
// Returns the merged buffer (allocated with alloc_dynamic()).
char *
merge_results(int *segment_counts, int *batch_counts, char **buffers, int nthreads)
{
    int total_segments = 0;
    int total_batches = 0;

    // Only count what the copy loop below will actually write
    for (int i = 0; i < nthreads; ++i) {
        if (segment_counts[i] > 0) {
            total_segments += segment_counts[i];
            total_batches += batch_counts[i];
        }
    }

    // Each batch takes TWO ints (offset, lod), i.e. 8 bytes
    int batches_offset = round_to_pow2(total_segments * (3 * 4 + 1), 4);
    char *merged = alloc_dynamic(batches_offset + total_batches * 2 * 4);

    float *points = (float *) merged;
    int *ids = (int *) (merged + total_segments * 4 * 2);
    unsigned char *pressures = (unsigned char *) (merged + total_segments * 4 * 3);
    int *batches = (int *) (merged + batches_offset);

    int bhead = 0;
    int batch_start = 0; // index of the next batch's first point in the merged buffer

    for (int i = 0; i < nthreads; ++i) {
        int segments = segment_counts[i];

        if (segments <= 0) {
            continue;
        }

        char *source = buffers[i];
        int nbatches = batch_counts[i];
        int *thread_batches = (int *) (source + round_to_pow2(segments * (4 * 3 + 1), 4));

        __builtin_memcpy(points, source, segments * 4 * 2);
        __builtin_memcpy(ids, source + segments * 4 * 2, segments * 4);
        __builtin_memcpy(pressures, source + segments * 4 * 3, segments);

        for (int j = 0; j < nbatches; ++j) {
            batches[bhead++] = batch_start;
            batches[bhead++] = thread_batches[j * 2 + 1];
            batch_start += thread_batches[j * 2 + 0];
        }

        points += segments * 2;
        ids += segments;
        pressures += segments;
    }

    segment_counts[0] = total_segments;
    batch_counts[0] = total_batches;

    return merged;
}