Browse Source

SIMD fast path for skipping long multi-line comments. ~190MB/sec

master
A.Olokhtonov 2 years ago
parent
commit
ff1826e43e
  1. 20
      input.c
  2. 277
      main.c

20
input.c

@@ -8,6 +8,8 @@ multi
line
comment */
/* short com */
123
0707
0xfAb
@@ -71,10 +73,22 @@ u"\1\12\123"
U"\xffaab"
L"\uffFF\xff"
[ ] ( ) { } . ->
[] ( ) { } . ->
++ -- & * + - ~ !
/ % << >> < > <= >=
/ %
<<
>>
<
>
<=
>=
? : ; ...
= *= /= %= += -= <<=
, # ##
<: :> <% %> %: %:%:
<:
:>
<%
%> %: %:%:
<stdio.h>
<stdio".h>

277
main.c

@@ -10,8 +10,9 @@
#include <immintrin.h>
enum token_kind {
#define MAX(a, b) ((a) > (b) ? (a) : (b))
enum token_kind {
TOKEN_KEYWORD,
TOKEN_IDENTIFIER,
@@ -58,41 +59,29 @@ advance(struct str *s, int by)
static int
nondigit(struct str s)
{
if (s.size) {
char c = s.text[0];
return(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_'));
}
return(0);
char c = s.text[0];
return(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_'));
}
static int
digit(struct str s)
{
if (s.size) {
char c = s.text[0];
return('0' <= c && c <= '9');
}
return(0);
char c = s.text[0];
return('0' <= c && c <= '9');
}
static int
nonzero_digit(struct str s)
{
if (s.size) {
char c = s.text[0];
return('1' <= c && c <= '9');
}
return(0);
char c = s.text[0];
return('1' <= c && c <= '9');
}
static int
octal_digit(struct str s)
{
if (s.size) {
char c = s.text[0];
return('0' <= c && c <= '7');
}
return(0);
char c = s.text[0];
return('0' <= c && c <= '7');
}
static int
@@ -111,31 +100,22 @@ hexadecimal_prefix(struct str s)
static int
hexadecimal_digit(struct str s)
{
if (s.size) {
char c = s.text[0];
return(digit(s) || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f'));
}
return(0);
char c = s.text[0];
return(digit(s) || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f'));
}
static int
unsigned_suffix(struct str s)
{
if (s.size) {
char c = s.text[0];
return(c == 'u' || c == 'U');
}
return(0);
char c = s.text[0];
return(c == 'u' || c == 'U');
}
static int
long_suffix(struct str s)
{
if (s.size) {
char c = s.text[0];
return(c == 'l' || c == 'L');
}
return(0);
char c = s.text[0];
return(c == 'l' || c == 'L');
}
static int
@@ -154,21 +134,15 @@ long_long_suffix(struct str s)
static int
sign(struct str s)
{
if (s.size) {
char c = s.text[0];
return(c == '+' || c == '-');
}
return(0);
char c = s.text[0];
return(c == '+' || c == '-');
}
static int
floating_suffix(struct str s)
{
if (s.size) {
char c = s.text[0];
return(c == 'f' || c == 'l' || c == 'F' || c == 'L');
}
return(0);
char c = s.text[0];
return(c == 'f' || c == 'l' || c == 'F' || c == 'L');
}
static int
@@ -291,11 +265,7 @@ hex_quad(struct str s)
static int
universal_character_name(struct str s)
{
if (s.size < 2) {
return(0);
}
if (s.text[0] == '\\') {
if (s.size >= 2 && s.text[0] == '\\') {
if (s.text[1] == 'u') {
advance(&s, 2);
int hq = hex_quad(s);
@@ -382,20 +352,18 @@ octal_constant(struct str s)
{
int start = s.size;
if (s.size) {
if (s.text[0] == '0') {
advance(&s, 1);
if (s.text[0] == '0') {
advance(&s, 1);
for (;;) {
if (octal_digit(s)) {
advance(&s, 1);
} else {
break;
}
for (;;) {
if (octal_digit(s)) {
advance(&s, 1);
} else {
break;
}
return(start - s.size);
}
return(start - s.size);
}
return(0);
@@ -471,16 +439,14 @@ fractional_constant(struct str s)
s.text += ds1;
s.size -= ds1;
if (s.size) {
if (s.text[0] == '.') {
advance(&s, 1);
if (s.text[0] == '.') {
advance(&s, 1);
int ds2 = digit_sequence(s);
advance(&s, ds2);
int ds2 = digit_sequence(s);
advance(&s, ds2);
if (ds1 > 0 || ds2 > 0) {
return(ds1 + ds2 + 1);
}
if (ds1 > 0 || ds2 > 0) {
return(ds1 + ds2 + 1);
}
}
@@ -492,18 +458,16 @@ exponent_part(struct str s)
{
int start = s.size;
if (s.size) {
if (s.text[0] == 'e' || s.text[0] == 'E') {
advance(&s, 1);
if (s.text[0] == 'e' || s.text[0] == 'E') {
advance(&s, 1);
int sgn = sign(s);
advance(&s, sgn);
int sgn = sign(s);
advance(&s, sgn);
int ds = digit_sequence(s);
if (ds) {
advance(&s, ds);
return(start - s.size);
}
int ds = digit_sequence(s);
if (ds) {
advance(&s, ds);
return(start - s.size);
}
}
@@ -564,7 +528,7 @@ hexadecimal_fractional_constant(struct str s)
advance(&s, hds1);
if (s.size && s.text[0] == '.') {
if (s.text[0] == '.') {
advance(&s, 1);
int hds2 = hexadecimal_digit_sequence(s);
@@ -582,7 +546,7 @@ binary_exponent_part(struct str s)
{
int start = s.size;
if (s.size && (s.text[0] == 'p' || s.text[0] == 'P')) {
if (s.text[0] == 'p' || s.text[0] == 'P') {
advance(&s, 1);
int sgn = sign(s);
@@ -673,34 +637,23 @@ c_char(struct str s)
return(sym);
}
if (s.size) {
char c = s.text[0];
return(c != '\'' && c != '\\' && c != '\n');
}
char c = s.text[0];
return(0);
return(c != '\'' && c != '\\' && c != '\n');
}
static int
h_char(struct str s)
{
if (s.size) {
char c = s.text[0];
return(c != '\n' && c != '>');
}
return(0);
char c = s.text[0];
return(c != '\n' && c != '>');
}
static int
q_char(struct str s)
{
if (s.size) {
char c = s.text[0];
return(c != '\n' && c != '\"');
}
return(0);
char c = s.text[0];
return(c != '\n' && c != '\"');
}
static int
@@ -732,7 +685,7 @@ character_constant(struct str s)
int start = s.size;
int ok = 0;
if (s.size && s.text[0] == '\'') {
if (s.text[0] == '\'') {
advance(&s, 1);
ok = 1;
} else if (s.size >= 2 && s.text[0] == 'L' && s.text[1] == '\'') {
@@ -750,7 +703,7 @@ character_constant(struct str s)
int ccs = c_char_sequence(s);
if (ccs) {
advance(&s, ccs);
if (s.size && s.text[0] == '\'') {
if (s.text[0] == '\'') {
advance(&s, 1);
return(start - s.size);
}
@@ -777,8 +730,9 @@ static int
whitespace(struct str s)
{
int start = s.size;
//int spaces = 0x20090a0d;
while (s.size && (s.text[0] == ' ' || s.text[0] == '\t' || s.text[0] == '\n' || s.text[0] == '\r')) {
while (s.text[0] == ' ' || s.text[0] == '\t' || s.text[0] == '\n' || s.text[0] == '\r') {
advance(&s, 1);
}
@@ -794,12 +748,8 @@ s_char(struct str s)
return(sym);
}
if (s.size) {
char c = s.text[0];
return(c != '\"' && c != '\\' && c != '\n');
}
return(0);
char c = s.text[0];
return(c != '\"' && c != '\\' && c != '\n');
}
static int
@@ -832,14 +782,8 @@ encoding_prefix(struct str s)
return(2);
}
if (s.size) {
char c = s.text[0];
if (c == 'u' || c == 'U' || c == 'L') {
return(1);
}
}
return(0);
char c = s.text[0];
return(c == 'u' || c == 'U' || c == 'L');
}
static int
@@ -850,13 +794,13 @@ string_literal(struct str s)
int ep = encoding_prefix(s);
advance(&s, ep);
if (s.size && s.text[0] == '\"') {
if (s.text[0] == '\"') {
advance(&s, 1);
int scs = s_char_sequence(s);
advance(&s, scs);
if (s.size && s.text[0] == '\"') {
if (s.text[0] == '\"') {
advance(&s, 1);
return(start - s.size);
}
@@ -908,18 +852,16 @@ punctuator(struct str s)
}
}
if (s.size) {
char c = s.text[0];
if (c == '[' || c == ']' || c == '(' || c == ')' ||
c == '{' || c == '}' || c == '.' || c == '&' ||
c == '*' || c == '+' || c == '-' || c == '~' ||
c == '!' || c == '/' || c == '%' || c == '<' ||
c == '>' || c == '^' || c == '|' || c == '?' ||
c == ':' || c == ';' || c == '=' || c == ',' ||
c == '#')
{
return(1);
}
char c = s.text[0];
if (c == '[' || c == ']' || c == '(' || c == ')' ||
c == '{' || c == '}' || c == '.' || c == '&' ||
c == '*' || c == '+' || c == '-' || c == '~' ||
c == '!' || c == '/' || c == '%' || c == '<' ||
c == '>' || c == '^' || c == '|' || c == '?' ||
c == ':' || c == ';' || c == '=' || c == ',' ||
c == '#')
{
return(1);
}
return(0);
@@ -974,27 +916,27 @@ q_char_sequence(struct str s)
static int
header_name(struct str s)
{
if (s.size && s.text[0] == '<') {
if (s.text[0] == '<') {
advance(&s, 1);
int hcs = h_char_sequence(s);
if (hcs) {
advance(&s, hcs);
if (s.size && s.text[0] == '>') {
if (s.text[0] == '>') {
return(hcs + 2);
}
}
}
if (s.size && s.text[0] == '\"') {
if (s.text[0] == '\"') {
advance(&s, 1);
int qcs = q_char_sequence(s);
if (qcs) {
advance(&s, qcs);
if (s.size && s.text[0] == '\"') {
if (s.text[0] == '\"') {
return(qcs + 2);
}
}
@@ -1028,6 +970,28 @@ comment(struct str s)
if (s.text[0] == '/' && s.text[1] == '*') {
/* multi-line comment */
advance(&s, 2);
__m128i mask = _mm_setr_epi8('*', '/', '*', '/', '*', '/', '*', '/', '*', '/', '*', '/', '*', '/', '*', '/');
__m128i mask_sus = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '*');
while (s.size > 16) {
__m128i chunk = _mm_loadu_si128((__m128i *)(s.text));
__m128i v1 = _mm_cmpeq_epi16(chunk, mask);
__m128i v2 = _mm_cmpeq_epi16(_mm_bslli_si128(chunk, 1), mask);
__m128i v3 = _mm_cmpeq_epi8(chunk, mask_sus);
__m128i v12 = _mm_or_si128(v1, v2);
__m128i v123 = _mm_or_si128(v12, v3);
if (!_mm_testz_si128(v123, v123)) {
break;
}
advance(&s, 16);
}
while (s.size) {
if (s.size >= 2 && s.text[0] == '*' && s.text[1] == '/') {
advance(&s, 2);
@@ -1057,36 +1021,42 @@ lex(char *text, int size)
if ((sym = comment(s))) {
//printf("Comment: ");
} else if ((sym = constant(s))) {
//printf("Constant: ");
//printf("%.*s\n", sym, s.text);
} else {
int sym_constant = constant(s);
int sym_punctuator = punctuator(s);
int sym_string = string_literal(s);
int sym_header = header_name(s);
int sym_identifier = identifier(s);
sym = MAX(sym_constant, MAX(sym_punctuator, MAX(sym_string, MAX(sym_header, sym_identifier))));
#if 0
} else if ((sym == pp_number(s))) {
printf("PP number: ");
if (sym == sym_constant) {
printf("Constant: ");
} else if (sym == sym_punctuator) {
printf("Punctuator: ");
} else if (sym == sym_string) {
printf("String: ");
} else if (sym == sym_header) {
printf("Header: ");
} else if (sym == sym_identifier) {
printf("Identifier: ");
}
#endif
} else if ((sym = punctuator(s))) {
//printf("Punctuator: ");
} else if ((sym = string_literal(s))) {
//printf("String literal: ");
} else if ((sym = header_name(s))) {
//printf("Header name: ");
} else if ((sym = identifier(s))) {
//printf("Identifier: ");
}
//printf("%.*s\n", sym, s.text);
if (sym) {
advance(&s, sym);
} else if (s.size) {
} else if (s.size == 1 && s.text[0] == '\0') {
break;
} else {
fprintf(stderr, "Error!\n");
break;
}
}
return(NULL);
}
@@ -1169,6 +1139,13 @@ main(int argc, char **argv)
int size = (int) sb.st_size;
char *data = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
if (size && data[size - 1] != '\n') {
fprintf(stderr, "No terminating new line. Fuck you!\n");
return(1);
}
data[size - 1] = '\0';
if (data == MAP_FAILED) {
perror("mmap");
return(1);

Loading…
Cancel
Save