commit 00eb2f9ea7be0fd10f948d8077291fbdf1da0237 Author: A.Olokhtonov Date: Wed Jun 15 11:46:19 2022 +0300 Initial lexer impl: slow, incomplete, probably wrong. ~105MB/sec diff --git a/input.c b/input.c new file mode 100644 index 0000000..0d45f94 --- /dev/null +++ b/input.c @@ -0,0 +1,75 @@ +ident_ +i1dent +i\uffffent +i\UFFfFaaAadent + +123 +0707 +0xfAb +0XFaB + +123U +123UL +123ULL +123L +123LU +123LLU + +123u +123ul +123ull +123l +123lu +123llu + +10e5 +10E5 +10e5f +10e5F +10e5L + +10e-5 +10e-5f +10e-5F +10e-5L + +10e+5 +10e+5f +10e+5F +10e+5L + +123.0 +123. +.01 + +123.0e-2f +123.e-5L +.02E+5l + +0xfffp-2 +0xfffp+2 +0xFFFP2 + +'a' +'abcd' +L'b' +u'c' +U'd' +'\n\r\'\"\?\\\a\b\f\n\r\t\v' +'\1\12\123' +'\xffaab' +'\uffFF\xff' + +"string" +u8"\n\r\'\"\?\\\a\b\f\n\r\t\v" +u"\1\12\123" +U"\xffaab" +L"\uffFF\xff" + +[ ] ( ) { } . -> +++ -- & * + - ~ ! +/ % << >> < > <= >= +? : ; ... += *= /= %= += -= <<= +, # ## +<: :> <% %> %: %:%: \ No newline at end of file diff --git a/main.c b/main.c new file mode 100644 index 0000000..309852e --- /dev/null +++ b/main.c @@ -0,0 +1,1222 @@ +#include +#include + +#include +#include + +#include +#include +#include + +enum token_kind { + + TOKEN_KEYWORD, + TOKEN_IDENTIFIER, + + TOKEN_INTEGER_CONSTANT, + TOKEN_FLOATING_CONTANT, + TOKEN_ENUMERATION_CONSTANT, + TOKEN_CHARACTER_CONSTANT, + + TOKEN_STRING_LITERAL, + TOKEN_PUNCTUATOR, + TOKEN_HEADER_NAME, + TOKEN_PP_NUMBER, + + TOKEN_COUNT, +}; + +struct str { + char *text; + int size; +}; + +struct token { + enum token_kind kind; + char *start; + char *end; // one past end +}; + +static int +nondigit(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return((c == '_') || ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')); + } + return(0); +} + +static int +digit(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return('0' <= c && c <= '9'); + } + return(0); +} + +static int +nonzero_digit(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return('1' <= c && c <= '9'); + } + return(0); +} + +static int +octal_digit(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return('0' <= c && c <= '7'); + } + return(0); +} + +static int +hexadecimal_prefix(struct str s) +{ + if (s.size >= 2) { + if (s.text[0] == '0') { + if (s.text[1] == 'x' || s.text[1] == 'X') { + return(2); + } + } + } + return(0); +} + +static int +hexadecimal_digit(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return(digit(s) || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f')); + } + return(0); +} + +static int +unsigned_suffix(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return(c == 'u' || c == 'U'); + } + return(0); +} + +static int +long_suffix(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return(c == 'l' || c == 'L'); + } + return(0); +} + +static int +long_long_suffix(struct str s) +{ + if (s.size >= 2) { + char c1 = s.text[0]; + char c2 = s.text[1]; + if ((c1 == 'l' && c2 == 'l') || (c1 == 'L' && c2 == 'L')) { + return(2); + } + } + return(0); +} + +static int +sign(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return(c == '+' || c == '-'); + } + return(0); +} + +static int +floating_suffix(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return(c == 'f' || c == 'l' || c == 'F' || c == 'L'); + } + return(0); +} + +static int +integer_suffix(struct str s) +{ + int sym = 0; + + if ((sym = unsigned_suffix(s))) { + s.text += sym; + s.size -= sym; + + int ll = long_long_suffix(s); + + if (ll) { + return(sym + ll); + } + + return(sym + long_suffix(s)); + } else if ((sym = long_long_suffix(s))) { + s.text += sym; + s.size -= sym; + return(sym + unsigned_suffix(s)); + } else if ((sym = long_suffix(s))) { + s.text += sym; + s.size -= sym; + return(sym + unsigned_suffix(s)); + } + + return(0); +} + +static int +simple_escape_sequence(struct str s) +{ + if (s.size >= 2) { + char c1 = s.text[0]; + if (c1 == '\\') { + char c2 = s.text[1]; + if (c2 == '\'' || c2 == '\"' || c2 == '?' || + c2 == '\\' || c2 == 'a' || c2 == 'b' || + c2 == 'f' || c2 == 'n' || c2 == 'r' || + c2 == 't' || c2 == 'v') { + return(2); + } + } + } + return(0); +} + +static int +octal_escape_sequence(struct str s) +{ + int start = s.size; + + if (s.size && s.text[0] == '\\') { + s.text++; + s.size--; + + if (octal_digit(s)) { + s.text++; + s.size--; + + if (octal_digit(s)) { + s.text++; + s.size--; + + if (octal_digit(s)) { + s.text++; + s.size--; + } + } + + return(start - s.size); + } + } + + return(0); +} + +static int +hexadecimal_escape_sequence(struct str s) +{ + int start = s.size; + + if (s.size >= 2) { + if (s.text[0] == '\\' && s.text[1] == 'x') { + s.text += 2; + s.size -= 2; + + if (hexadecimal_digit(s)) { + s.text++; + s.size--; + + for (;;) { + if (hexadecimal_digit(s)) { + s.text++; + s.size--; + } else { + break; + } + } + + return(start - s.size); + } + } + } + + return(0); +} + +static int +hex_quad(struct str s) +{ + if (s.size >= 4) { + if (hexadecimal_digit(s)) { + s.text++; + s.size--; + if (hexadecimal_digit(s)) { + s.text++; + s.size--; + if (hexadecimal_digit(s)) { + s.text++; + s.size--; + if (hexadecimal_digit(s)) { + return(4); + } + } + } + } + } + return(0); +} + +static int +universal_character_name(struct str s) +{ + if (s.size < 2) { + return(0); + } + + if (s.text[0] == '\\') { + if (s.text[1] == 'u') { + s.text += 2; + s.size -= 2; + int hq = hex_quad(s); + if (hq) { + return(2 + hq); + } + } else if (s.text[1] == 'U') { + s.text += 2; + s.size -= 2; + int hq1 = hex_quad(s); + if (hq1) { + s.text += hq1; + s.size -= hq1; + int hq2 = hex_quad(s); + if (hq2) { + return(2 + hq1 + hq2); + } + } + } + } + + return(0); +} + +static int +identifier_nondigit(struct str s) +{ + int sym = 0; + + if ((sym = nondigit(s)) || (sym = universal_character_name(s))) { + return(sym); + } + + return(0); +} + +static int +identifier(struct str s) +{ + int start = s.size; + int in = identifier_nondigit(s); + + if (in) { + s.text += in; + s.size -= in; + + int sym = 0; + + for (;;) { + if ((sym = identifier_nondigit(s)) || (sym = digit(s))) { + s.text += sym; + s.size -= sym; + } else { + break; + } + } + + return(start - s.size); + } + + return(0); +} + +static int +decimal_constant(struct str s) +{ + int start = s.size; + + if (nonzero_digit(s)) { + s.text++; + s.size--; + + for (;;) { + if (digit(s)) { + s.text++; + s.size--; + } else { + break; + } + } + + return(start - s.size); + } + + return(0); +} + +static int +octal_constant(struct str s) +{ + int start = s.size; + + if (s.size) { + if (s.text[0] == '0') { + s.text++; + s.size--; + + for (;;) { + if (octal_digit(s)) { + s.text++; + s.size--; + } else { + break; + } + } + + return(start - s.size); + } + } + + return(0); +} + +static int +hexadecimal_constant(struct str s) +{ + int start = s.size; + int hp = hexadecimal_prefix(s); + + if (hp) { + s.text += hp; + s.size -= hp; + + if (hexadecimal_digit(s)) { + s.text++; + s.size--; + + for (;;) { + if (hexadecimal_digit(s)) { + s.text++; + s.size--; + } else { + break; + } + } + + return(start - s.size); + } + } + + return(0); +} + +static int +integer_constant(struct str s) +{ + int sym = 0; + + if ((sym = hexadecimal_constant(s)) || (sym = octal_constant(s)) || (sym = decimal_constant(s))) { + s.text += sym; + s.size -= sym; + return(sym + integer_suffix(s)); + } + + return(0); +} + +static int +digit_sequence(struct str s) +{ + int start = s.size; + + if (digit(s)) { + s.text++; + s.size--; + + for (;;) { + if (digit(s)) { + s.text++; + s.size--; + } else { + break; + } + } + + return(start - s.size); + } + + return(0); +} + +static int +fractional_constant(struct str s) +{ + int ds1 = digit_sequence(s); + + s.text += ds1; + s.size -= ds1; + + if (s.size) { + if (s.text[0] == '.') { + s.text++; + s.size--; + + int ds2 = digit_sequence(s); + + s.text += ds2; + s.size -= ds2; + + if (ds1 > 0 || ds2 > 0) { + return(ds1 + ds2 + 1); + } + } + } + + return(0); +} + +static int +exponent_part(struct str s) +{ + int start = s.size; + + if (s.size) { + if (s.text[0] == 'e' || s.text[0] == 'E') { + s.text++; + s.size--; + + int sgn = sign(s); + s.text += sgn; + s.size -= sgn; + + int ds = digit_sequence(s); + if (ds) { + s.text += ds; + s.size -= ds; + return(start - s.size); + } + } + } + + return(0); +} + +static int +decimal_floating_constant(struct str s) +{ + int sym = 0; + if ((sym = fractional_constant(s))) { + s.text += sym; + s.size -= sym; + + int ep = exponent_part(s); + s.text += ep; + s.size -= ep; + + return(sym + ep + floating_suffix(s)); + } else if ((sym = digit_sequence(s))) { + s.text += sym; + s.size -= sym; + + int ep = 0; + + if ((ep = exponent_part(s))) { + s.text += ep; + s.size -= ep; + return(sym + ep + floating_suffix(s)); + } + } + + return(0); +} + +static int +hexadecimal_digit_sequence(struct str s) +{ + int start = s.size; + + if (hexadecimal_digit(s)) { + s.text++; + s.size--; + + for (;;) { + if (hexadecimal_digit(s)) { + s.text++; + s.size--; + } else { + break; + } + } + + return(start - s.size); + } + + return(0); +} + +static int +hexadecimal_fractional_constant(struct str s) +{ + int hds1 = hexadecimal_digit_sequence(s); + + s.text += hds1; + s.size -= hds1; + + if (s.size && s.text[0] == '.') { + s.text++; + s.size--; + + int hds2 = hexadecimal_digit_sequence(s); + + if (hds1 > 0 || hds2 > 0) { + return(hds1 + hds2 + 1); + } + } + + return(0); +} + +static int +binary_exponent_part(struct str s) +{ + int start = s.size; + + if (s.size && (s.text[0] == 'p' || s.text[0] == 'P')) { + s.text++; + s.size--; + + int sgn = sign(s); + s.text += sgn; + s.size -= sgn; + + int ds = digit_sequence(s); + if (ds) { + s.size -= ds; + return(start - s.size); + } + } + + return(0); +} + +static int +hexadecimal_floating_constant(struct str s) +{ + int hp = 0; + int start = s.size; + + if ((hp = hexadecimal_prefix(s))) { + s.text += hp; + s.size -= hp; + + int hfc = 0; + int hds = 0; + + if ((hfc = hexadecimal_fractional_constant(s))) { + s.text += hfc; + s.size -= hfc; + int bep = binary_exponent_part(s); + if (bep) { + s.text += bep; + s.size -= bep; + return(start - s.size + floating_suffix(s)); + } + } else if ((hds = hexadecimal_digit_sequence(s))) { + s.text += hds; + s.size -= hds; + int bep = binary_exponent_part(s); + if (bep) { + s.text += bep; + s.size -= bep; + return(start - s.size + floating_suffix(s)); + } + } + } + + return(0); +} + +static int +floating_constant(struct str s) +{ + int sym = 0; + + if ((sym = decimal_floating_constant(s)) || (sym = hexadecimal_floating_constant(s))) { + return(sym); + } + + return(0); +} + +#if 0 +static int +enumeration_constant(struct str s) +{ + int i = identifier(s); + return(i); +} +#endif + +static int +escape_sequence(struct str s) +{ + int sym = 0; + + if ((sym = simple_escape_sequence(s)) || (sym = octal_escape_sequence(s)) || + (sym = hexadecimal_escape_sequence(s)) || (sym = universal_character_name(s))) { + return(sym); + } + + return(0); +} + +static int +c_char(struct str s) +{ + int sym = 0; + + if ((sym = escape_sequence(s))) { + return(sym); + } + + if (s.size) { + char c = s.text[0]; + return(c != '\'' && c != '\\' && c != '\n'); + } + + return(0); +} + +static int +h_char(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return(c != '\n' && c != '>'); + } + + return(0); +} + +static int +q_char(struct str s) +{ + if (s.size) { + char c = s.text[0]; + return(c != '\n' && c != '\"'); + } + + return(0); +} + +static int +c_char_sequence(struct str s) +{ + int start = s.size; + int sc = 0; + + if ((sc = c_char(s))) { + s.text += sc; + s.size -= sc; + + for (;;) { + if ((sc = c_char(s))) { + s.text += sc; + s.size -= sc; + } else { + break; + } + } + + return(start - s.size); + } + + return(0); +} + +static int +character_constant(struct str s) +{ + int start = s.size; + int ok = 0; + + if (s.size && s.text[0] == '\'') { + s.text++; + s.size--; + ok = 1; + } else if (s.size >= 2 && s.text[0] == 'L' && s.text[1] == '\'') { + s.text += 2; + s.size -= 2; + ok = 1; + } else if (s.size >= 2 && s.text[0] == 'u' && s.text[1] == '\'') { + s.text += 2; + s.size -= 2; + ok = 1; + } else if (s.size >= 2 && s.text[0] == 'U' && s.text[1] == '\'') { + s.text += 2; + s.size -= 2; + ok = 1; + } + + if (ok) { + int ccs = c_char_sequence(s); + if (ccs) { + s.text += ccs; + s.size -= ccs; + if (s.size && s.text[0] == '\'') { + s.size--; + return(start - s.size); + } + } + } + + return(0); +} + +static int +constant(struct str s) +{ + int sym = 0; + + if ((sym = floating_constant(s)) || (sym = integer_constant(s)) || (sym = character_constant(s))) { + /* || (sym = enumeration_constant(s)) */ + return(sym); + } + + return(0); +} + +static int +whitespace(struct str s) +{ + int start = s.size; + + while (s.size && (s.text[0] == ' ' || s.text[0] == '\t' || s.text[0] == '\n' || s.text[0] == '\r')) { + s.text++; + s.size--; + } + + return(start - s.size); +} + +static int +s_char(struct str s) +{ + int sym = 0; + + if ((sym = escape_sequence(s))) { + return(sym); + } + + if (s.size) { + char c = s.text[0]; + return(c != '\"' && c != '\\' && c != '\n'); + } + + return(0); +} + +static int +s_char_sequence(struct str s) +{ + int start = s.size; + int sc = 0; + + if ((sc = s_char(s))) { + s.text += sc; + s.size -= sc; + + for (;;) { + if ((sc = s_char(s))) { + s.text += sc; + s.size -= sc; + } else { + break; + } + } + + return(start - s.size); + } + + return(0); +} + +static int +encoding_prefix(struct str s) +{ + if (s.size >= 2 && s.text[0] == 'u' && s.text[1] == '8') { + return(2); + } + + if (s.size) { + char c = s.text[0]; + if (c == 'u' || c == 'U' || c == 'L') { + return(1); + } + } + + return(0); +} + +static int +string_literal(struct str s) +{ + int start = s.size; + + int ep = encoding_prefix(s); + s.text += ep; + s.size -= ep; + + if (s.size && s.text[0] == '\"') { + s.text++; + s.size--; + + int scs = s_char_sequence(s); + s.text += scs; + s.size -= scs; + + if (s.size && s.text[0] == '\"') { + s.size--; + return(start - s.size); + } + } + + return(0); +} + +static int +punctuator(struct str s) +{ + if (s.size >= 4) { + if (s.text[0] == '%' && s.text[1] == ':' && s.text[2] == '%' && s.text[3] == ':') + { + return(4); + } + } + + if (s.size >= 3) { + char c1 = s.text[0]; + char c2 = s.text[1]; + char c3 = s.text[2]; + if ((c1 == '.' && c2 == '.' && c3 == '.') || + (c1 == '<' && c2 == '<' && c3 == '=') || + (c1 == '>' && c2 == '>' && c3 == '=')) + { + return(3); + } + } + + if (s.size >= 2) { + char c1 = s.text[0]; + char c2 = s.text[1]; + if ((c1 == '-' && c2 == '>') || (c1 == '+' && c2 == '+') || + (c1 == '-' && c2 == '-') || (c1 == '<' && c2 == '<') || + (c1 == '>' && c2 == '>') || (c1 == '<' && c2 == '=') || + (c1 == '>' && c2 == '=') || (c1 == '=' && c2 == '=') || + (c1 == '!' && c2 == '=') || (c1 == '&' && c2 == '&') || + (c1 == '|' && c2 == '|') || (c1 == '*' && c2 == '=') || + (c1 == '/' && c2 == '=') || (c1 == '%' && c2 == '=') || + (c1 == '+' && c2 == '=') || (c1 == '-' && c2 == '=') || + (c1 == '&' && c2 == '=') || (c1 == '^' && c2 == '=') || + (c1 == '|' && c2 == '=') || (c1 == '#' && c2 == '#') || + (c1 == '<' && c2 == ':') || (c1 == ':' && c2 == '>') || + (c1 == '<' && c2 == '%') || (c1 == '>' && c2 == '%') || + (c1 == '%' && c2 == ':')) + { + return(2); + } + } + + if (s.size) { + char c = s.text[0]; + if (c == '[' || c == ']' || c == '(' || c == ')' || + c == '{' || c == '}' || c == '.' || c == '&' || + c == '*' || c == '+' || c == '-' || c == '~' || + c == '!' || c == '/' || c == '%' || c == '<' || + c == '>' || c == '^' || c == '|' || c == '?' || + c == ':' || c == ';' || c == '=' || c == ',' || + c == '#') + { + return(1); + } + } + + return(0); +} + +static int +h_char_sequence(struct str s) +{ + int start = s.size; + int sc = 0; + + if ((sc = h_char(s))) { + s.text += sc; + s.size -= sc; + + for (;;) { + if ((sc = h_char(s))) { + s.text += sc; + s.size -= sc; + } else { + break; + } + } + + return(start - s.size); + } + + return(0); +} + +static int +q_char_sequence(struct str s) +{ + int start = s.size; + int sc = 0; + + if ((sc = q_char(s))) { + s.text += sc; + s.size -= sc; + + for (;;) { + if ((sc = q_char(s))) { + s.text += sc; + s.size -= sc; + } else { + break; + } + } + + return(start - s.size); + } + + return(0); +} + +static int +header_name(struct str s) +{ + if (s.size && s.text[0] == '<') { + s.text++; + s.size--; + + int hcs = h_char_sequence(s); + if (hcs) { + s.text += hcs; + s.size -= hcs; + + if (s.size && s.text[0] == '>') { + return(hcs + 2); + } + } + } + + if (s.size && s.text[0] == '\"') { + s.text++; + s.size--; + + int qcs = q_char_sequence(s); + if (qcs) { + s.text += qcs; + s.size -= qcs; + + if (s.size && s.text[0] == '\"') { + return(qcs + 2); + } + } + } + + return(0); +} + +static int +comment(struct str s) +{ + int start = s.size; + + if (s.size >= 2) { + if (s.text[0] == '/' && s.text[1] == '/') { + /* single-line comment */ + s.text += 2; + s.size -= 2; + + while (s.size) { + if (s.text[0] == '\n') { + s.text++; + s.size--; + break; + } + + s.text++; + s.size--; + } + + return(start - s.size); + } + + if (s.text[0] == '/' && s.text[1] == '*') { + /* multi-line comment */ + + while (s.size) { + if (s.size >= 2 && s.text[0] == '*' && s.text[1] == '/') { + s.text += 2; + s.size -= 2; + break; + } + + s.text++; + s.size--; + } + + return(start - s.size); + } + } + + return(0); +} + +static struct token * +lex(char *text, int size) +{ + struct str s = { text, size }; + + while (s.size) { + int sym = whitespace(s); + s.text += sym; + s.size -= sym; + + if ((sym = comment(s))) { + //printf("Comment: "); + } else if ((sym = constant(s))) { + //printf("Constant: "); + + +#if 0 + } else if ((sym == pp_number(s))) { + printf("PP number: "); +#endif + + } else if ((sym = punctuator(s))) { + //printf("Punctuator: "); + } else if ((sym = string_literal(s))) { + //printf("String literal: "); + } else if ((sym = header_name(s))) { + //printf("Header name: "); + } else if ((sym = identifier(s))) { + //printf("Identifier: "); + } + + //printf("%.*s\n", sym, s.text); + + if (sym) { + s.text += sym; + s.size -= sym; + } else if (s.size) { + fprintf(stderr, "Error!\n"); + break; + } + } + + + return(NULL); +} + +static void +preprocess(char *data, int size) +{ + for (int i = 0; i < size; ++i) { + if (data[i] == '\\') { + for (int j = i + 1; j < size; ++j) { + if (data[j] == '\n') { + data[j] = ' '; + data[i] = ' '; + } else if (data[j] != ' ' && data[j] != '\t') { + break; + } + } + } + } +} + +static void +run(char *data, int size) +{ + struct timespec tp = { 0 }; + + clock_gettime(CLOCK_MONOTONIC_RAW, &tp); + unsigned long long before = tp.tv_sec * 1000000ULL + tp.tv_nsec / 1000ULL; + + preprocess(data, size); + struct token *tokens = lex(data, size); + (void) tokens; + + clock_gettime(CLOCK_MONOTONIC_RAW, &tp); + + unsigned long long after = tp.tv_sec * 1000000ULL + tp.tv_nsec / 1000ULL; + + unsigned long long dt = after - before; + + fprintf(stderr, "%.2fms, %.2fMB/s\n", (float) dt / 1000, (float) size / dt); +} + +int +main(int argc, char **argv) +{ + if (argc != 2) { + fprintf(stderr, "Usage: %s input_file.c\n", argv[0]); + return(1); + } + + char *file = argv[1]; + int fd = open(file, O_RDONLY); + + if (fd == -1) { + perror("open"); + return(1); + } + + struct stat sb = { 0 }; + if (fstat(fd, &sb) == -1) { + perror("fstat"); + return(1); + } + + int size = (int) sb.st_size; + + char *data = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (data == MAP_FAILED) { + perror("mmap"); + return(1); + } + + run(data, size); + + return(0); +} \ No newline at end of file