| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298 |
- #ifndef LEXER_H
- #define LEXER_H
- #include <stdlib.h>
- #include "./token.h"
- #include "./str.h"
- struct lexer {
- struct str in;
- size_t pos;
- size_t rpos;
- char c;
- };
/* Status codes the lexer reports through `struct lexer_err`. */
enum lexer_err_code {
    LEXER_ERR_OK               = 0, /* no error */
    LEXER_ERR_INVALID_PP_IDENT = 1  /* '#' not followed by a known directive name */
};
- struct lexer_err {
- enum lexer_err_code code;
- const char *name;
- };
/* Create a lexer over `in` with its first character already loaded. */
struct lexer lexer_create(struct str in);

/* Advance the cursor by one character; `c` becomes '\0' past the end. */
void lexer_read_char(struct lexer *l);

/*
 * Skip whitespace and return the next token. On an invalid preprocessor
 * directive, LEXER_ERR_INVALID_PP_IDENT is stored in `*err` (when `err` is
 * not NULL) and TOKEN_ILLEGAL is returned.
 */
struct token lexer_next_token(struct lexer *l, struct lexer_err *err);

/* Consume an identifier at the cursor and return it as a slice of the input. */
struct str lexer_read_ident(struct lexer *l);

/*
 * Consume a double-quoted string literal whose opening quote is at the
 * cursor; the returned slice excludes the quotes.
 */
struct str lexer_read_str_lit(struct lexer *l);

/* Consume a run of decimal digits at the cursor and return it as a slice. */
struct str lexer_read_int_lit(struct lexer *l);

/* Classify an identifier: a keyword/type token type, or TT_IDENT. */
enum token_type_enum lexer_lookup_ident(struct str ident);

/* Map a directive name (without '#') to its TT_PP_* type, or TT_ILLEGAL. */
enum token_type_enum lexer_lookup_pp(struct str ident);

/* Advance past spaces, tabs, carriage returns and newlines. */
void lexer_skip_whitespace(struct lexer *l);

/* Build the error record (code plus printable name) for `code`. */
struct lexer_err lexer_err_create(enum lexer_err_code code);

/* Internal helpers. */
bool _lexer_is_letter(char c);  /* ASCII letter or underscore */
bool _lexer_is_number(char c);  /* ASCII decimal digit */
void _lexer_set_err(struct lexer_err *err, enum lexer_err_code code); /* no-op when err is NULL */
- #if defined(IMP) || defined(LEXER_IMP)
- struct lexer
- lexer_create(struct str in)
- {
- struct lexer l = {0};
- l.in = in;
- lexer_read_char(&l);
- return l;
- }
- void
- lexer_read_char(struct lexer *l)
- {
- if ( l->rpos >= l->in.size ) {
- l->c = '\0';
- } else {
- l->c = l->in.data[l->rpos];
- }
- l->pos = l->rpos;
- ++l->rpos;
- }
- struct token
- lexer_next_token(struct lexer *l, struct lexer_err *err)
- {
- #define _LEXER_CUR_CHAR str_slice(l->in, l->pos, l->pos+1)
- struct token t = TOKEN_ILLEGAL;
- lexer_skip_whitespace(l);
- switch ( l->c ) {
- case '=': t = token_create(TT_ASSIGN, _LEXER_CUR_CHAR); break;
- case ';': t = token_create(TT_SEMICOLON, _LEXER_CUR_CHAR); break;
- case '(': t = token_create(TT_LPAREN, _LEXER_CUR_CHAR); break;
- case ')': t = token_create(TT_RPAREN, _LEXER_CUR_CHAR); break;
- case ',': t = token_create(TT_COMMA, _LEXER_CUR_CHAR); break;
- case '*': t = token_create(TT_ASTERISK, _LEXER_CUR_CHAR); break;
- case '-': t = token_create(TT_DASH, _LEXER_CUR_CHAR); break;
- case '.': t = token_create(TT_DOT, _LEXER_CUR_CHAR); break;
- case '+': t = token_create(TT_PLUS, _LEXER_CUR_CHAR); break;
- case '{': t = token_create(TT_LBRACE, _LEXER_CUR_CHAR); break;
- case '}': t = token_create(TT_RBRACE, _LEXER_CUR_CHAR); break;
- case '<': t = token_create(TT_LABRACKET, _LEXER_CUR_CHAR); break;
- case '>': t = token_create(TT_RABRACKET, _LEXER_CUR_CHAR); break;
- case '\'': t = token_create(TT_SQUOTE, _LEXER_CUR_CHAR); break;
- case '#':
- lexer_read_char(l);
- if ( _lexer_is_letter(l->c) ) {
- struct str ident = lexer_read_ident(l);
- t = token_create(lexer_lookup_pp(ident), ident);
- if ( t.typ.code == TT_ILLEGAL ) {
- goto invalid_pp;
- }
- return t;
- }
- invalid_pp:
- _lexer_set_err(err, LEXER_ERR_INVALID_PP_IDENT);
- return TOKEN_ILLEGAL;
- break;
- case '"':
- t = token_create(TT_STR_LIT, lexer_read_str_lit(l));
- return t;
- break;
- case '\0': t = token_create(TT_EOF, STR_EMPTY); break;
- default:
- if ( _lexer_is_letter(l->c) ) {
- struct str ident = lexer_read_ident(l);
- t = token_create(lexer_lookup_ident(ident), ident);
- return t;
- }
- if ( _lexer_is_number(l->c) ) {
- return token_create(TT_INT_LIT, lexer_read_int_lit(l));
- }
- break;
- }
- lexer_read_char(l);
- return t;
- #undef _LEXER_CUR_CHAR
- }
- struct str
- lexer_read_ident(struct lexer *l)
- {
- size_t pos = l->pos;
- while ( _lexer_is_letter(l->c) ) {
- lexer_read_char(l);
- }
- return str_slice(l->in, pos, l->pos);
- }
- struct str
- lexer_read_str_lit(struct lexer *l)
- {
- size_t pos = l->pos;
- lexer_read_char(l);
- loop:
- switch ( l->c ) {
- case '"':
- case '\0':
- break;
- default:
- lexer_read_char(l);
- goto loop;
- break;
- }
- lexer_read_char(l);
- return str_slice(l->in, pos+1, l->pos-1);
- }
- struct str
- lexer_read_int_lit(struct lexer *l)
- {
- size_t pos = l->pos;
- while ( _lexer_is_number(l->c) ) {
- lexer_read_char(l);
- }
- return str_slice(l->in, pos, l->pos);
-
- }
- enum token_type_enum
- lexer_lookup_ident(struct str ident)
- {
- if ( ident.size < 3 ) {
- return TT_IDENT;
- }
- switch ( ident.data[0] ) {
- case 'i':
- if ( str_eq_cstr(ident, "int", 3) ) {
- return TT_TYPE;
- }
- break;
- case 'c':
- if ( str_eq_cstr(ident, "char", 4) ) {
- return TT_TYPE;
- }
- if ( str_eq_cstr(ident, "const", 5) ) {
- return TT_CONST;
- }
- break;
- case 'r':
- if ( str_eq_cstr(ident, "return", 6) ) {
- return TT_RETURN;
- }
- break;
- }
- return TT_IDENT;
- }
- enum token_type_enum
- lexer_lookup_pp(struct str ident)
- {
- if ( ident.size < 2 ) {
- return TT_ILLEGAL;
- }
- switch ( ident.size ) {
- case 7:
- if ( str_eq_cstr(ident, "include", 7) ) {
- return TT_PP_INCLUDE;
- }
- break;
- case 6:
- if ( str_eq_cstr(ident, "define", 6) ) {
- return TT_PP_DEFINE;
- }
- if ( str_eq_cstr(ident, "ifndef", 6) ) {
- return TT_PP_IFNDEF;
- }
- break;
- case 5:
- if ( str_eq_cstr(ident, "ifdef", 5) ) {
- return TT_PP_IFDEF;
- }
- break;
- case 2:
- if ( str_eq_cstr(ident, "if", 2) ) {
- return TT_PP_IF;
- }
- break;
- }
- return TT_ILLEGAL;
- }
- void
- lexer_skip_whitespace(struct lexer *l)
- {
- loop:
- switch ( l->c ) {
- case ' ':
- case '\t':
- case '\r':
- case '\n':
- lexer_read_char(l);
- goto loop;
- }
- return;
- }
- struct lexer_err
- lexer_err_create(enum lexer_err_code code)
- {
- #define _LEXER_ERR_CASE(it) case it: le.name = #it; break;
- struct lexer_err le = {0};
- le.code = code;
- switch ( code ) {
- _LEXER_ERR_CASE(LEXER_ERR_OK);
- _LEXER_ERR_CASE(LEXER_ERR_INVALID_PP_IDENT);
- }
- return le;
- #undef _LEXER_ERR_CASE
- }
/*
 * Report whether `c` may appear in an identifier as a letter: an ASCII
 * upper- or lower-case letter, or an underscore.
 */
bool
_lexer_is_letter(char c)
{
    if ( c == '_' ) {
        return true;
    }
    if ( c >= 'A' && c <= 'Z' ) {
        return true;
    }
    return c >= 'a' && c <= 'z';
}
/* Report whether `c` is an ASCII decimal digit ('0' through '9'). */
bool
_lexer_is_number(char c)
{
    if ( c < '0' ) {
        return false;
    }
    return c <= '9';
}
- void
- _lexer_set_err(struct lexer_err *err, enum lexer_err_code code)
- {
- if ( err == NULL ) {
- return;
- }
- *err = lexer_err_create(code);
- }
- #endif /* defined(IMP) || defined(LEXER_IMP) */
- #endif /* LEXER_H */
|