From ff1826e43efff831b7a15ae354fa89dcef0f709c Mon Sep 17 00:00:00 2001 From: "A.Olokhtonov" Date: Fri, 17 Jun 2022 01:10:17 +0300 Subject: [PATCH] SIMD fast path for skipping long multi-line comments. ~190MB/sec --- input.c | 20 +++- main.c | 287 ++++++++++++++++++++++++++------------------------------ 2 files changed, 149 insertions(+), 158 deletions(-) diff --git a/input.c b/input.c index baf835c..b155c1c 100644 --- a/input.c +++ b/input.c @@ -8,6 +8,8 @@ multi line comment */ +/* short com */ + 123 0707 0xfAb @@ -71,10 +73,22 @@ u"\1\12\123" U"\xffaab" L"\uffFF\xff" -[ ] ( ) { } . -> +[] ( ) { } . -> ++ -- & * + - ~ ! -/ % << >> < > <= >= +/ % +<< +>> +< +> +<= +>= ? : ; ... = *= /= %= += -= <<= , # ## -<: :> <% %> %: %:%: +<: +:> +<% +%> %: %:%: + + + diff --git a/main.c b/main.c index f816df0..aa5c903 100644 --- a/main.c +++ b/main.c @@ -10,8 +10,9 @@ #include +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + enum token_kind { - TOKEN_KEYWORD, TOKEN_IDENTIFIER, @@ -58,41 +59,29 @@ advance(struct str *s, int by) static int nondigit(struct str s) { - if (s.size) { - char c = s.text[0]; - return(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_')); - } - return(0); + char c = s.text[0]; + return(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_')); } static int digit(struct str s) { - if (s.size) { - char c = s.text[0]; - return('0' <= c && c <= '9'); - } - return(0); + char c = s.text[0]; + return('0' <= c && c <= '9'); } static int nonzero_digit(struct str s) { - if (s.size) { - char c = s.text[0]; - return('1' <= c && c <= '9'); - } - return(0); + char c = s.text[0]; + return('1' <= c && c <= '9'); } static int octal_digit(struct str s) { - if (s.size) { - char c = s.text[0]; - return('0' <= c && c <= '7'); - } - return(0); + char c = s.text[0]; + return('0' <= c && c <= '7'); } static int @@ -111,31 +100,22 @@ hexadecimal_prefix(struct str s) static int hexadecimal_digit(struct str s) { - if (s.size) { - char c = s.text[0]; - return(digit(s) || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f')); - } - return(0); + char c = s.text[0]; + return(digit(s) || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f')); } static int unsigned_suffix(struct str s) { - if (s.size) { - char c = s.text[0]; - return(c == 'u' || c == 'U'); - } - return(0); + char c = s.text[0]; + return(c == 'u' || c == 'U'); } static int long_suffix(struct str s) { - if (s.size) { - char c = s.text[0]; - return(c == 'l' || c == 'L'); - } - return(0); + char c = s.text[0]; + return(c == 'l' || c == 'L'); } static int @@ -154,21 +134,15 @@ long_long_suffix(struct str s) static int sign(struct str s) { - if (s.size) { - char c = s.text[0]; - return(c == '+' || c == '-'); - } - return(0); + char c = s.text[0]; + return(c == '+' || c == '-'); } static int floating_suffix(struct str s) { - if (s.size) { - char c = s.text[0]; - return(c == 'f' || c == 'l' || c == 'F' || c == 'L'); - } - return(0); + char c = s.text[0]; + return(c == 'f' || c == 'l' || c == 'F' || c == 'L'); } static int @@ -291,11 +265,7 @@ hex_quad(struct str s) static int universal_character_name(struct str s) { - if (s.size < 2) { - return(0); - } - - if (s.text[0] == '\\') { + if (s.size >= 2 && s.text[0] == '\\') { if (s.text[1] == 'u') { advance(&s, 2); int hq = hex_quad(s); @@ -382,20 +352,18 @@ octal_constant(struct str s) { int start = s.size; - if (s.size) { - if (s.text[0] == '0') { - advance(&s, 1); - - for (;;) { - if (octal_digit(s)) { - advance(&s, 1); - } else { - break; - } + if (s.text[0] == '0') { + advance(&s, 1); + + for (;;) { + if (octal_digit(s)) { + advance(&s, 1); + } else { + break; } - - return(start - s.size); } + + return(start - s.size); } return(0); @@ -471,16 +439,14 @@ fractional_constant(struct str s) s.text += ds1; s.size -= ds1; - if (s.size) { - if (s.text[0] == '.') { - advance(&s, 1); - - int ds2 = digit_sequence(s); - advance(&s, ds2); - - if (ds1 > 0 || ds2 > 0) { - return(ds1 + ds2 + 1); - } + if (s.text[0] == '.') { + advance(&s, 1); + + int ds2 = digit_sequence(s); + advance(&s, ds2); + + if (ds1 > 0 || ds2 > 0) { + return(ds1 + ds2 + 1); } } @@ -492,18 +458,16 @@ exponent_part(struct str s) { int start = s.size; - if (s.size) { - if (s.text[0] == 'e' || s.text[0] == 'E') { - advance(&s, 1); - - int sgn = sign(s); - advance(&s, sgn); - - int ds = digit_sequence(s); - if (ds) { - advance(&s, ds); - return(start - s.size); - } + if (s.text[0] == 'e' || s.text[0] == 'E') { + advance(&s, 1); + + int sgn = sign(s); + advance(&s, sgn); + + int ds = digit_sequence(s); + if (ds) { + advance(&s, ds); + return(start - s.size); } } @@ -564,7 +528,7 @@ hexadecimal_fractional_constant(struct str s) advance(&s, hds1); - if (s.size && s.text[0] == '.') { + if (s.text[0] == '.') { advance(&s, 1); int hds2 = hexadecimal_digit_sequence(s); @@ -582,7 +546,7 @@ binary_exponent_part(struct str s) { int start = s.size; - if (s.size && (s.text[0] == 'p' || s.text[0] == 'P')) { + if (s.text[0] == 'p' || s.text[0] == 'P') { advance(&s, 1); int sgn = sign(s); @@ -673,34 +637,23 @@ c_char(struct str s) return(sym); } - if (s.size) { - char c = s.text[0]; - return(c != '\'' && c != '\\' && c != '\n'); - } + char c = s.text[0]; - return(0); + return(c != '\'' && c != '\\' && c != '\n'); } static int h_char(struct str s) { - if (s.size) { - char c = s.text[0]; - return(c != '\n' && c != '>'); - } - - return(0); + char c = s.text[0]; + return(c != '\n' && c != '>'); } static int q_char(struct str s) { - if (s.size) { - char c = s.text[0]; - return(c != '\n' && c != '\"'); - } - - return(0); + char c = s.text[0]; + return(c != '\n' && c != '\"'); } static int @@ -732,7 +685,7 @@ character_constant(struct str s) int start = s.size; int ok = 0; - if (s.size && s.text[0] == '\'') { + if (s.text[0] == '\'') { advance(&s, 1); ok = 1; } else if (s.size >= 2 && s.text[0] == 'L' && s.text[1] == '\'') { @@ -750,7 +703,7 @@ character_constant(struct str s) int ccs = c_char_sequence(s); if (ccs) { advance(&s, ccs); - if (s.size && s.text[0] == '\'') { + if (s.text[0] == '\'') { advance(&s, 1); return(start - s.size); } @@ -777,8 +730,9 @@ static int whitespace(struct str s) { int start = s.size; + //int spaces = 0x20090a0d; - while (s.size && (s.text[0] == ' ' || s.text[0] == '\t' || s.text[0] == '\n' || s.text[0] == '\r')) { + while (s.text[0] == ' ' || s.text[0] == '\t' || s.text[0] == '\n' || s.text[0] == '\r') { advance(&s, 1); } @@ -794,12 +748,8 @@ s_char(struct str s) return(sym); } - if (s.size) { - char c = s.text[0]; - return(c != '\"' && c != '\\' && c != '\n'); - } - - return(0); + char c = s.text[0]; + return(c != '\"' && c != '\\' && c != '\n'); } static int @@ -832,14 +782,8 @@ encoding_prefix(struct str s) return(2); } - if (s.size) { - char c = s.text[0]; - if (c == 'u' || c == 'U' || c == 'L') { - return(1); - } - } - - return(0); + char c = s.text[0]; + return(c == 'u' || c == 'U' || c == 'L'); } static int @@ -850,13 +794,13 @@ string_literal(struct str s) int ep = encoding_prefix(s); advance(&s, ep); - if (s.size && s.text[0] == '\"') { + if (s.text[0] == '\"') { advance(&s, 1); int scs = s_char_sequence(s); advance(&s, scs); - if (s.size && s.text[0] == '\"') { + if (s.text[0] == '\"') { advance(&s, 1); return(start - s.size); } @@ -908,18 +852,16 @@ punctuator(struct str s) } } - if (s.size) { - char c = s.text[0]; - if (c == '[' || c == ']' || c == '(' || c == ')' || - c == '{' || c == '}' || c == '.' || c == '&' || - c == '*' || c == '+' || c == '-' || c == '~' || - c == '!' || c == '/' || c == '%' || c == '<' || - c == '>' || c == '^' || c == '|' || c == '?' || - c == ':' || c == ';' || c == '=' || c == ',' || - c == '#') - { - return(1); - } + char c = s.text[0]; + if (c == '[' || c == ']' || c == '(' || c == ')' || + c == '{' || c == '}' || c == '.' || c == '&' || + c == '*' || c == '+' || c == '-' || c == '~' || + c == '!' || c == '/' || c == '%' || c == '<' || + c == '>' || c == '^' || c == '|' || c == '?' || + c == ':' || c == ';' || c == '=' || c == ',' || + c == '#') + { + return(1); } return(0); @@ -974,27 +916,27 @@ q_char_sequence(struct str s) static int header_name(struct str s) { - if (s.size && s.text[0] == '<') { + if (s.text[0] == '<') { advance(&s, 1); int hcs = h_char_sequence(s); if (hcs) { advance(&s, hcs); - if (s.size && s.text[0] == '>') { + if (s.text[0] == '>') { return(hcs + 2); } } } - if (s.size && s.text[0] == '\"') { + if (s.text[0] == '\"') { advance(&s, 1); int qcs = q_char_sequence(s); if (qcs) { advance(&s, qcs); - if (s.size && s.text[0] == '\"') { + if (s.text[0] == '\"') { return(qcs + 2); } } @@ -1028,6 +970,28 @@ comment(struct str s) if (s.text[0] == '/' && s.text[1] == '*') { /* multi-line comment */ + + advance(&s, 2); + + __m128i mask = _mm_setr_epi8('*', '/', '*', '/', '*', '/', '*', '/', '*', '/', '*', '/', '*', '/', '*', '/'); + + __m128i mask_sus = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '*'); + + while (s.size > 16) { + __m128i chunk = _mm_loadu_si128((__m128i *)(s.text)); + __m128i v1 = _mm_cmpeq_epi16(chunk, mask); + __m128i v2 = _mm_cmpeq_epi16(_mm_bslli_si128(chunk, 1), mask); + __m128i v3 = _mm_cmpeq_epi8(chunk, mask_sus); + __m128i v12 = _mm_or_si128(v1, v2); + __m128i v123 = _mm_or_si128(v12, v3); + + if (!_mm_testz_si128(v123, v123)) { + break; + } + + advance(&s, 16); + } + while (s.size) { if (s.size >= 2 && s.text[0] == '*' && s.text[1] == '/') { advance(&s, 2); @@ -1057,36 +1021,42 @@ lex(char *text, int size) if ((sym = comment(s))) { //printf("Comment: "); - } else if ((sym = constant(s))) { - //printf("Constant: "); + //printf("%.*s\n", sym, s.text); + } else { + int sym_constant = constant(s); + int sym_punctuator = punctuator(s); + int sym_string = string_literal(s); + int sym_header = header_name(s); + int sym_identifier = identifier(s); + sym = MAX(sym_constant, MAX(sym_punctuator, MAX(sym_string, MAX(sym_header, sym_identifier)))); #if 0 - } else if ((sym == pp_number(s))) { - printf("PP number: "); + if (sym == sym_constant) { + printf("Constant: "); + } else if (sym == sym_punctuator) { + printf("Punctuator: "); + } else if (sym == sym_string) { + printf("String: "); + } else if (sym == sym_header) { + printf("Header: "); + } else if (sym == sym_identifier) { + printf("Identifier: "); + } #endif - } else if ((sym = punctuator(s))) { - //printf("Punctuator: "); - } else if ((sym = string_literal(s))) { - //printf("String literal: "); - } else if ((sym = header_name(s))) { - //printf("Header name: "); - } else if ((sym = identifier(s))) { - //printf("Identifier: "); } - //printf("%.*s\n", sym, s.text); - if (sym) { advance(&s, sym); - } else if (s.size) { + } else if (s.size == 1 && s.text[0] == '\0') { + break; + } else { fprintf(stderr, "Error!\n"); break; } } - return(NULL); } @@ -1169,6 +1139,13 @@ main(int argc, char **argv) int size = (int) sb.st_size; char *data = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (size && data[size - 1] != '\n') { + fprintf(stderr, "No terminating new line. Fuck you!\n"); + return(1); + } + + data[size - 1] = '\0'; + if (data == MAP_FAILED) { perror("mmap"); return(1);