#ifndef LEXER_H #define LEXER_H #include #include "./token.h" #include "./str.h" struct lexer { struct str in; size_t pos; size_t rpos; char c; }; struct lexer lexer_create(struct str in); void lexer_read_char(struct lexer *l); struct token lexer_next_token(struct lexer *l); struct str lexer_read_ident(struct lexer *l); struct str lexer_read_str_lit(struct lexer *l); struct str lexer_read_int_lit(struct lexer *l); enum token_type_enum lexer_lookup_ident(struct str ident); void lexer_skip_whitespace(struct lexer *l); bool _lexer_is_letter(char c); bool _lexer_is_number(char c); #if defined(IMP) || defined(LEXER_IMP) struct lexer lexer_create(struct str in) { struct lexer l = {0}; l.in = in; lexer_read_char(&l); return l; } void lexer_read_char(struct lexer *l) { if ( l->rpos >= l->in.size ) { l->c = '\0'; } else { l->c = l->in.data[l->rpos]; } l->pos = l->rpos; ++l->rpos; } struct token lexer_next_token(struct lexer *l) { #define _LEXER_CUR_CHAR str_slice(l->in, l->pos, l->pos+1) struct token t = TOKEN_ILLEGAL; lexer_skip_whitespace(l); switch ( l->c ) { case '=': t = token_create(TT_ASSIGN, _LEXER_CUR_CHAR); break; case ';': t = token_create(TT_SEMICOLON, _LEXER_CUR_CHAR); break; case '(': t = token_create(TT_LPAREN, _LEXER_CUR_CHAR); break; case ')': t = token_create(TT_RPAREN, _LEXER_CUR_CHAR); break; case ',': t = token_create(TT_COMMA, _LEXER_CUR_CHAR); break; case '*': t = token_create(TT_ASTERISK, _LEXER_CUR_CHAR); break; case '-': t = token_create(TT_DASH, _LEXER_CUR_CHAR); break; case '.': t = token_create(TT_DOT, _LEXER_CUR_CHAR); break; case '+': t = token_create(TT_PLUS, _LEXER_CUR_CHAR); break; case '{': t = token_create(TT_LBRACE, _LEXER_CUR_CHAR); break; case '}': t = token_create(TT_RBRACE, _LEXER_CUR_CHAR); break; case '<': t = token_create(TT_LABRACKET, _LEXER_CUR_CHAR); break; case '>': t = token_create(TT_RABRACKET, _LEXER_CUR_CHAR); break; case '\'': t = token_create(TT_SQUOTE, _LEXER_CUR_CHAR); break; case '"': t = token_create(TT_STR_LIT, lexer_read_str_lit(l)); return t; break; case '\0': t = token_create(TT_EOF, STR_EMPTY); break; default: if ( _lexer_is_letter(l->c) ) { struct str ident = lexer_read_ident(l); t = token_create(lexer_lookup_ident(ident), ident); return t; } if ( _lexer_is_number(l->c) ) { return token_create(TT_INT_LIT, lexer_read_int_lit(l)); } break; } lexer_read_char(l); return t; #undef _LEXER_CUR_CHAR } struct str lexer_read_ident(struct lexer *l) { size_t pos = l->pos; while ( _lexer_is_letter(l->c) ) { lexer_read_char(l); } return str_slice(l->in, pos, l->pos); } struct str lexer_read_str_lit(struct lexer *l) { size_t pos = l->pos; lexer_read_char(l); loop: switch ( l->c ) { case '"': case '\0': break; default: lexer_read_char(l); goto loop; break; } lexer_read_char(l); return str_slice(l->in, pos+1, l->pos-1); } struct str lexer_read_int_lit(struct lexer *l) { size_t pos = l->pos; while ( _lexer_is_number(l->c) ) { lexer_read_char(l); } return str_slice(l->in, pos, l->pos); } enum token_type_enum lexer_lookup_ident(struct str ident) { if ( ident.size < 3 ) { return TT_IDENT; } switch ( ident.data[0] ) { case 'i': if ( str_eq_cstr(ident, "int", 3) ) { return TT_TYPE; } if ( str_eq_cstr(ident, "include", 7) ) { return TT_INCLUDE; } break; case 'c': if ( str_eq_cstr(ident, "char", 4) ) { return TT_TYPE; } if ( str_eq_cstr(ident, "const", 5) ) { return TT_CONST; } break; case 'r': if ( str_eq_cstr(ident, "return", 6) ) { return TT_RETURN; } break; } return TT_IDENT; } void lexer_skip_whitespace(struct lexer *l) { loop: switch ( l->c ) { case ' ': case '\t': case '\r': case '\n': lexer_read_char(l); goto loop; } return; } bool _lexer_is_letter(char c) { return ( c >= 0x41 && c <= 0x5A ) \ || ( c >= 0x61 && c <= 0x7A ) \ || c == 0x5F; } bool _lexer_is_number(char c) { return ( c >= '0' && c <= '9' ); } #endif /* defined(IMP) || defined(LEXER_IMP) */ #endif /* LEXER_H */