| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476 |
- #ifndef LEXER_H
- #define LEXER_H
- #include <stdlib.h>
- #include "./token.h"
- #include "./str.h"
- struct lexer {
- struct str in;
- size_t pos;
- size_t rpos;
- char c;
- };
/*
 * Status codes reported by the lexer through struct lexer_err.
 */
enum lexer_err_code {
    /* no error */
    LEXER_ERR_OK                 = 0,
    /* '#' was not followed by a recognised directive name */
    LEXER_ERR_INVALID_PP_IDENT   = 1,
    /* an #include path hit end of line/input before its closing delimiter */
    LEXER_ERR_INVALID_PP_INCLUDE = 2
};
- struct lexer_err {
- enum lexer_err_code code;
- const char *name;
- };
/* Construction and single-character cursor movement. */
struct lexer lexer_create(struct str in);
void lexer_read_char(struct lexer *l);
char lexer_peek_char(struct lexer *l);

/* Produce the next token. `err` may be NULL, in which case no status is stored. */
struct token lexer_next_token(struct lexer *l, struct lexer_err *err);

/* Readers: each returns a slice of the lexer's input and advances the cursor. */
struct str lexer_read_ident(struct lexer *l);
struct str lexer_read_str_lit(struct lexer *l);
struct str lexer_read_int_lit(struct lexer *l);
struct str lexer_read_until_or(struct lexer *l,
                               char tgt,
                               char limit,
                               bool *reached_limit);

/* Keyword classification and whitespace skipping. */
enum token_type_enum lexer_lookup_ident(struct str ident);
void lexer_skip_whitespace(struct lexer *l);

/* Preprocessor directive handling; called with the cursor just after '#'. */
struct token lexer_handle_pp(struct lexer *l, struct lexer_err *err);
enum token_type_enum _lexer_lookup_pp(struct str ident);

/* Error helpers. */
struct lexer_err lexer_err_create(enum lexer_err_code code);
void _lexer_set_err(struct lexer_err *err, enum lexer_err_code code);

/* Character classification. */
bool _lexer_is_letter(char c);
bool _lexer_is_number(char c);
- #if defined(IMP) || defined(LEXER_IMP)
- struct lexer
- lexer_create(struct str in)
- {
- struct lexer l = {0};
- l.in = in;
- lexer_read_char(&l);
- return l;
- }
- void
- lexer_read_char(struct lexer *l)
- {
- if ( l->rpos >= l->in.size ) {
- l->c = '\0';
- } else {
- l->c = l->in.data[l->rpos];
- }
- l->pos = l->rpos;
- ++l->rpos;
- }
- char
- lexer_peek_char(struct lexer *l)
- {
- if ( l->rpos >= l->in.size ) {
- return '\0';
- }
- return l->in.data[l->rpos];
- }
- struct token
- lexer_next_token(struct lexer *l, struct lexer_err *err)
- {
- #define _LEXER_CUR_CHAR str_slice(l->in, l->pos, l->pos+1)
- #define _LEXER_2_CHAR str_slice(l->in, l->pos, l->pos+2)
- struct token t = TOKEN_ILLEGAL;
- lexer_skip_whitespace(l);
- switch ( l->c ) {
- case '=':
- if ( lexer_peek_char(l) == '=' ) {
- t = token_create(TT_EQ, _LEXER_2_CHAR);
- lexer_read_char(l);
- goto aft_swt;
- }
- t = token_create(TT_ASSIGN, _LEXER_CUR_CHAR);
- break;
- case ';': t = token_create(TT_SEMICOLON, _LEXER_CUR_CHAR); break;
- case '(': t = token_create(TT_LPAREN, _LEXER_CUR_CHAR); break;
- case ')': t = token_create(TT_RPAREN, _LEXER_CUR_CHAR); break;
- case '!':
- if ( lexer_peek_char(l) == '=' ) {
- t = token_create(TT_NOT_EQ, _LEXER_2_CHAR);
- lexer_read_char(l);
- goto aft_swt;
- }
- t = token_create(TT_BANG, _LEXER_CUR_CHAR);
- break;
- case '/': t = token_create(TT_SLASH, _LEXER_CUR_CHAR); break;
- case ',': t = token_create(TT_COMMA, _LEXER_CUR_CHAR); break;
- case '*': t = token_create(TT_ASTERISK, _LEXER_CUR_CHAR); break;
- case '-': t = token_create(TT_DASH, _LEXER_CUR_CHAR); break;
- case '.': t = token_create(TT_DOT, _LEXER_CUR_CHAR); break;
- case '+': t = token_create(TT_PLUS, _LEXER_CUR_CHAR); break;
- case '{': t = token_create(TT_LBRACE, _LEXER_CUR_CHAR); break;
- case '}': t = token_create(TT_RBRACE, _LEXER_CUR_CHAR); break;
- case '<': t = token_create(TT_LABRACKET, _LEXER_CUR_CHAR); break;
- case '>': t = token_create(TT_RABRACKET, _LEXER_CUR_CHAR); break;
- case '\'': t = token_create(TT_SQUOTE, _LEXER_CUR_CHAR); break;
- case '#':
- lexer_read_char(l);
- if ( _lexer_is_letter(l->c) ) {
- t = lexer_handle_pp(l, err);
- if ( t.typ.code == TT_ILLEGAL ) {
- goto ret_invalid;
- }
- goto ret_ok;
- }
- _lexer_set_err(err, LEXER_ERR_INVALID_PP_IDENT);
- goto ret_invalid;
- break;
- case '"':
- t = token_create(TT_STR_LIT, lexer_read_str_lit(l));
- goto ret_ok;
- break;
- case '\0': t = token_create(TT_EOF, STR_EMPTY); break;
- default:
- if ( _lexer_is_letter(l->c) ) {
- struct str ident = lexer_read_ident(l);
- t = token_create(lexer_lookup_ident(ident), ident);
- goto ret_ok;
- }
- if ( _lexer_is_number(l->c) ) {
- t = token_create(TT_INT_LIT, lexer_read_int_lit(l));
- goto ret_ok;
- }
- break;
- }
- aft_swt:
- lexer_read_char(l);
- ret_ok:
- _lexer_set_err(err, LEXER_ERR_OK);
- return t;
- ret_invalid:
- return TOKEN_ILLEGAL;
- #undef _LEXER_2_CHAR
- #undef _LEXER_CUR_CHAR
- }
- struct str
- lexer_read_ident(struct lexer *l)
- {
- size_t pos = l->pos;
- while ( _lexer_is_letter(l->c) ) {
- lexer_read_char(l);
- }
- return str_slice(l->in, pos, l->pos);
- }
- struct str
- lexer_read_str_lit(struct lexer *l)
- {
- size_t pos = l->pos;
- lexer_read_char(l);
- loop:
- switch ( l->c ) {
- case '"':
- case '\0':
- break;
- default:
- lexer_read_char(l);
- goto loop;
- break;
- }
- lexer_read_char(l);
- return str_slice(l->in, pos+1, l->pos-1);
- }
- struct str
- lexer_read_int_lit(struct lexer *l)
- {
- size_t pos = l->pos;
- while ( _lexer_is_number(l->c) ) {
- lexer_read_char(l);
- }
- return str_slice(l->in, pos, l->pos);
-
- }
- struct str
- lexer_read_until_or(struct lexer *l, char tgt, char limit, bool *reached_limit)
- {
- size_t pos = l->pos;
- loop:
- if ( l->c == tgt ) {
- goto ret_ok;
- }
- if ( l->c == limit || l->c == '\0' ) {
- goto ret_err;
- }
- lexer_read_char(l);
- goto loop;
- ret_ok:
- if ( reached_limit != NULL ) {
- *reached_limit = false;
- }
- lexer_read_char(l);
- return str_slice(l->in, pos, l->pos-1);
- ret_err:
- if ( reached_limit != NULL ) {
- *reached_limit = true;
- }
- return STR_EMPTY;
- }
- enum token_type_enum
- lexer_lookup_ident(struct str ident)
- {
- if ( ident.size < 2 ) {
- return TT_IDENT;
- }
- switch ( ident.data[0] ) {
- case 'i':
- if ( str_eq_cstr(ident, "if", 2) ) {
- return TT_IF;
- }
- if ( str_eq_cstr(ident, "int", 3) ) {
- return TT_TYPE;
- }
- break;
- case 's':
- if ( str_eq_cstr(ident, "switch", 6) ) {
- return TT_SWITCH;
- }
- if ( str_eq_cstr(ident, "struct", 6) ) {
- return TT_SWITCH;
- }
- break;
- case 'c':
- if ( str_eq_cstr(ident, "char", 4) ) {
- return TT_TYPE;
- }
- if ( str_eq_cstr(ident, "case", 4) ) {
- return TT_CASE;
- }
- if ( str_eq_cstr(ident, "const", 5) ) {
- return TT_CONST;
- }
- if ( str_eq_cstr(ident, "continue", 8) ) {
- return TT_CONST;
- }
- break;
- case 'r':
- if ( str_eq_cstr(ident, "return", 6) ) {
- return TT_RETURN;
- }
- break;
- case 'e':
- if ( str_eq_cstr(ident, "else", 4) ) {
- return TT_ELSE;
- }
- if ( str_eq_cstr(ident, "enum", 4) ) {
- return TT_ENUM;
- }
- break;
- case 'f':
- if ( str_eq_cstr(ident, "for", 3) ) {
- return TT_FOR;
- }
- break;
- case 'b':
- if ( str_eq_cstr(ident, "break", 5) ) {
- return TT_BREAK;
- }
- break;
- case 'd':
- if ( str_eq_cstr(ident, "do", 2) ) {
- return TT_WHILE;
- }
- break;
- case 'w':
- if ( str_eq_cstr(ident, "while", 5) ) {
- return TT_WHILE;
- }
- break;
- }
- return TT_IDENT;
- }
- void
- lexer_skip_whitespace(struct lexer *l)
- {
- loop:
- switch ( l->c ) {
- case ' ':
- case '\t':
- case '\r':
- case '\n':
- lexer_read_char(l);
- goto loop;
- }
- return;
- }
- struct token
- lexer_handle_pp(struct lexer *l, struct lexer_err *err)
- {
- struct token t = TOKEN_ILLEGAL;
- struct str ident = lexer_read_ident(l);
- enum token_type_enum tt = _lexer_lookup_pp(ident);
- bool reached_limit = false;
- if ( tt == TT_ILLEGAL ) {
- _lexer_set_err(err, LEXER_ERR_INVALID_PP_IDENT);
- goto ret_illegal;
- }
- lexer_skip_whitespace(l);
- switch ( tt ) {
- case TT_PP_INCLUDE:
- switch ( l->c ) {
- case '<':
- lexer_read_char(l);
- ident = lexer_read_until_or(l, '>', '\n', &reached_limit);
- if ( reached_limit == true ) {
- _lexer_set_err(err, LEXER_ERR_INVALID_PP_INCLUDE);
- goto ret_illegal;
- }
- t = token_create(tt, ident);
- goto ret_ok;
- break;
- case '"':
- lexer_read_char(l);
- ident = lexer_read_until_or(l, '"', '\n', &reached_limit);
- if ( reached_limit == true ) {
- _lexer_set_err(err, LEXER_ERR_INVALID_PP_INCLUDE);
- goto ret_illegal;
- }
- t = token_create(tt, ident);
- goto ret_ok;
- break;
- default: goto ret_illegal; break;
- }
- goto ret_ok;
- break;
- case TT_PP_DEFINE:
- ident = lexer_read_until_or(l, '\n', '\\', &reached_limit);
- if ( reached_limit == true ) {
- goto ret_illegal;
- }
- t = token_create(tt, ident);
- goto ret_ok;
- break;
- case TT_PP_IFNDEF: goto ret_illegal; break;
- case TT_PP_IFDEF: goto ret_illegal; break;
- case TT_PP_IF: goto ret_illegal; break;
- default: goto ret_illegal; break;
- }
- ret_ok:
- _lexer_set_err(err, LEXER_ERR_OK);
- return t;
- ret_illegal:
- return TOKEN_ILLEGAL;
- }
- enum token_type_enum
- _lexer_lookup_pp(struct str ident)
- {
- if ( ident.size < 2 ) {
- return TT_ILLEGAL;
- }
- switch ( ident.size ) {
- case 7:
- if ( str_eq_cstr(ident, "include", 7) ) {
- return TT_PP_INCLUDE;
- }
- break;
- case 6:
- if ( str_eq_cstr(ident, "define", 6) ) {
- return TT_PP_DEFINE;
- }
- if ( str_eq_cstr(ident, "ifndef", 6) ) {
- return TT_PP_IFNDEF;
- }
- break;
- case 5:
- if ( str_eq_cstr(ident, "ifdef", 5) ) {
- return TT_PP_IFDEF;
- }
- break;
- case 2:
- if ( str_eq_cstr(ident, "if", 2) ) {
- return TT_PP_IF;
- }
- break;
- }
- return TT_ILLEGAL;
- }
- struct lexer_err
- lexer_err_create(enum lexer_err_code code)
- {
- #define _LEXER_ERR_CASE(it) case it: le.name = #it; break;
- struct lexer_err le = {0};
- le.code = code;
- switch ( code ) {
- _LEXER_ERR_CASE(LEXER_ERR_OK);
- _LEXER_ERR_CASE(LEXER_ERR_INVALID_PP_IDENT);
- _LEXER_ERR_CASE(LEXER_ERR_INVALID_PP_INCLUDE);
- }
- return le;
- #undef _LEXER_ERR_CASE
- }
/*
 * True when `c` may appear in an identifier as a letter: 'A'-'Z', 'a'-'z'
 * or '_' (ASCII).
 */
bool
_lexer_is_letter(char c)
{
    if ( c == '_' ) {
        return true;
    }
    if ( c >= 'A' && c <= 'Z' ) {
        return true;
    }
    return c >= 'a' && c <= 'z';
}
/*
 * True when `c` is a decimal digit, '0' through '9'.
 */
bool
_lexer_is_number(char c)
{
    return !( c < '0' || c > '9' );
}
- void
- _lexer_set_err(struct lexer_err *err, enum lexer_err_code code)
- {
- if ( err == NULL ) {
- return;
- }
- *err = lexer_err_create(code);
- }
- #endif /* defined(IMP) || defined(LEXER_IMP) */
- #endif /* LEXER_H */
|