lexer.h 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. #ifndef LEXER_H
  2. #define LEXER_H
  3. #include <stdlib.h>
  4. #include "./token.h"
  5. #include "./str.h"
/*
 * Lexer state: a cursor over an input string.
 *
 * The fields are kept in step by lexer_read_char(): after each call `c`
 * holds the character at index `pos`, and `rpos` is `pos + 1`.
 */
struct lexer {
    struct str in;  /* input being tokenized */
    size_t pos;     /* index of the character currently held in `c` */
    size_t rpos;    /* index of the next character to be read */
    char c;         /* current character; '\0' once reading has run past in.size */
};
/* Error codes the lexer can report through a struct lexer_err. */
enum lexer_err_code {
    LEXER_ERR_OK = 0,           /* zero value: no error */
    LEXER_ERR_INVALID_PP_IDENT  /* '#' was not followed by a recognized directive name */
};
/* An error code paired with a printable name for it. */
struct lexer_err {
    enum lexer_err_code code;
    /* Spelling of the enumerator as set by lexer_err_create(); stays NULL
     * for a code that function does not list. */
    const char *name;
};
/* Build a lexer over `in` and load its first character. */
struct lexer lexer_create(struct str in);
/* Advance by one character; the current character becomes '\0' past the end of input. */
void lexer_read_char(struct lexer *l);
/* Skip whitespace and return the next token. `err` may be NULL; it is
 * written only when an invalid preprocessor directive is encountered. */
struct token lexer_next_token(struct lexer *l, struct lexer_err *err);
/* Consume the identifier at the current position and return it as a slice of the input. */
struct str lexer_read_ident(struct lexer *l);
/* Consume a double-quoted string literal (the current character is the
 * opening quote) and return its contents without the quotes. */
struct str lexer_read_str_lit(struct lexer *l);
/* Consume a run of decimal digits and return it as a slice of the input. */
struct str lexer_read_int_lit(struct lexer *l);
/* Classify an identifier: TT_TYPE for "int"/"char", TT_CONST for "const",
 * TT_RETURN for "return", TT_IDENT otherwise. */
enum token_type_enum lexer_lookup_ident(struct str ident);
/* Classify a directive name (without the '#'): include, define, ifndef,
 * ifdef, if. Returns TT_ILLEGAL for anything else. */
enum token_type_enum lexer_lookup_pp(struct str ident);
/* Advance past spaces, tabs, carriage returns and newlines. */
void lexer_skip_whitespace(struct lexer *l);
/* Build a lexer_err for `code` with `name` set to the enumerator's spelling. */
struct lexer_err lexer_err_create(enum lexer_err_code code);
/* Helpers used by the functions above. */
/* True for ASCII letters and '_'. */
bool _lexer_is_letter(char c);
/* True for '0'..'9'. */
bool _lexer_is_number(char c);
/* Store lexer_err_create(code) in *err; does nothing when err is NULL. */
void _lexer_set_err(struct lexer_err *err, enum lexer_err_code code);
  33. #if defined(IMP) || defined(LEXER_IMP)
  34. struct lexer
  35. lexer_create(struct str in)
  36. {
  37. struct lexer l = {0};
  38. l.in = in;
  39. lexer_read_char(&l);
  40. return l;
  41. }
  42. void
  43. lexer_read_char(struct lexer *l)
  44. {
  45. if ( l->rpos >= l->in.size ) {
  46. l->c = '\0';
  47. } else {
  48. l->c = l->in.data[l->rpos];
  49. }
  50. l->pos = l->rpos;
  51. ++l->rpos;
  52. }
  53. struct token
  54. lexer_next_token(struct lexer *l, struct lexer_err *err)
  55. {
  56. #define _LEXER_CUR_CHAR str_slice(l->in, l->pos, l->pos+1)
  57. struct token t = TOKEN_ILLEGAL;
  58. lexer_skip_whitespace(l);
  59. switch ( l->c ) {
  60. case '=': t = token_create(TT_ASSIGN, _LEXER_CUR_CHAR); break;
  61. case ';': t = token_create(TT_SEMICOLON, _LEXER_CUR_CHAR); break;
  62. case '(': t = token_create(TT_LPAREN, _LEXER_CUR_CHAR); break;
  63. case ')': t = token_create(TT_RPAREN, _LEXER_CUR_CHAR); break;
  64. case ',': t = token_create(TT_COMMA, _LEXER_CUR_CHAR); break;
  65. case '*': t = token_create(TT_ASTERISK, _LEXER_CUR_CHAR); break;
  66. case '-': t = token_create(TT_DASH, _LEXER_CUR_CHAR); break;
  67. case '.': t = token_create(TT_DOT, _LEXER_CUR_CHAR); break;
  68. case '+': t = token_create(TT_PLUS, _LEXER_CUR_CHAR); break;
  69. case '{': t = token_create(TT_LBRACE, _LEXER_CUR_CHAR); break;
  70. case '}': t = token_create(TT_RBRACE, _LEXER_CUR_CHAR); break;
  71. case '<': t = token_create(TT_LABRACKET, _LEXER_CUR_CHAR); break;
  72. case '>': t = token_create(TT_RABRACKET, _LEXER_CUR_CHAR); break;
  73. case '\'': t = token_create(TT_SQUOTE, _LEXER_CUR_CHAR); break;
  74. case '#':
  75. lexer_read_char(l);
  76. if ( _lexer_is_letter(l->c) ) {
  77. struct str ident = lexer_read_ident(l);
  78. t = token_create(lexer_lookup_pp(ident), ident);
  79. if ( t.typ.code == TT_ILLEGAL ) {
  80. goto invalid_pp;
  81. }
  82. return t;
  83. }
  84. invalid_pp:
  85. _lexer_set_err(err, LEXER_ERR_INVALID_PP_IDENT);
  86. return TOKEN_ILLEGAL;
  87. break;
  88. case '"':
  89. t = token_create(TT_STR_LIT, lexer_read_str_lit(l));
  90. return t;
  91. break;
  92. case '\0': t = token_create(TT_EOF, STR_EMPTY); break;
  93. default:
  94. if ( _lexer_is_letter(l->c) ) {
  95. struct str ident = lexer_read_ident(l);
  96. t = token_create(lexer_lookup_ident(ident), ident);
  97. return t;
  98. }
  99. if ( _lexer_is_number(l->c) ) {
  100. return token_create(TT_INT_LIT, lexer_read_int_lit(l));
  101. }
  102. break;
  103. }
  104. lexer_read_char(l);
  105. return t;
  106. #undef _LEXER_CUR_CHAR
  107. }
  108. struct str
  109. lexer_read_ident(struct lexer *l)
  110. {
  111. size_t pos = l->pos;
  112. while ( _lexer_is_letter(l->c) ) {
  113. lexer_read_char(l);
  114. }
  115. return str_slice(l->in, pos, l->pos);
  116. }
  117. struct str
  118. lexer_read_str_lit(struct lexer *l)
  119. {
  120. size_t pos = l->pos;
  121. lexer_read_char(l);
  122. loop:
  123. switch ( l->c ) {
  124. case '"':
  125. case '\0':
  126. break;
  127. default:
  128. lexer_read_char(l);
  129. goto loop;
  130. break;
  131. }
  132. lexer_read_char(l);
  133. return str_slice(l->in, pos+1, l->pos-1);
  134. }
  135. struct str
  136. lexer_read_int_lit(struct lexer *l)
  137. {
  138. size_t pos = l->pos;
  139. while ( _lexer_is_number(l->c) ) {
  140. lexer_read_char(l);
  141. }
  142. return str_slice(l->in, pos, l->pos);
  143. }
  144. enum token_type_enum
  145. lexer_lookup_ident(struct str ident)
  146. {
  147. if ( ident.size < 3 ) {
  148. return TT_IDENT;
  149. }
  150. switch ( ident.data[0] ) {
  151. case 'i':
  152. if ( str_eq_cstr(ident, "int", 3) ) {
  153. return TT_TYPE;
  154. }
  155. break;
  156. case 'c':
  157. if ( str_eq_cstr(ident, "char", 4) ) {
  158. return TT_TYPE;
  159. }
  160. if ( str_eq_cstr(ident, "const", 5) ) {
  161. return TT_CONST;
  162. }
  163. break;
  164. case 'r':
  165. if ( str_eq_cstr(ident, "return", 6) ) {
  166. return TT_RETURN;
  167. }
  168. break;
  169. }
  170. return TT_IDENT;
  171. }
  172. enum token_type_enum
  173. lexer_lookup_pp(struct str ident)
  174. {
  175. if ( ident.size < 2 ) {
  176. return TT_ILLEGAL;
  177. }
  178. switch ( ident.size ) {
  179. case 7:
  180. if ( str_eq_cstr(ident, "include", 7) ) {
  181. return TT_PP_INCLUDE;
  182. }
  183. break;
  184. case 6:
  185. if ( str_eq_cstr(ident, "define", 6) ) {
  186. return TT_PP_DEFINE;
  187. }
  188. if ( str_eq_cstr(ident, "ifndef", 6) ) {
  189. return TT_PP_IFNDEF;
  190. }
  191. break;
  192. case 5:
  193. if ( str_eq_cstr(ident, "ifdef", 5) ) {
  194. return TT_PP_IFDEF;
  195. }
  196. break;
  197. case 2:
  198. if ( str_eq_cstr(ident, "if", 2) ) {
  199. return TT_PP_IF;
  200. }
  201. break;
  202. }
  203. return TT_ILLEGAL;
  204. }
  205. void
  206. lexer_skip_whitespace(struct lexer *l)
  207. {
  208. loop:
  209. switch ( l->c ) {
  210. case ' ':
  211. case '\t':
  212. case '\r':
  213. case '\n':
  214. lexer_read_char(l);
  215. goto loop;
  216. }
  217. return;
  218. }
  219. struct lexer_err
  220. lexer_err_create(enum lexer_err_code code)
  221. {
  222. #define _LEXER_ERR_CASE(it) case it: le.name = #it; break;
  223. struct lexer_err le = {0};
  224. le.code = code;
  225. switch ( code ) {
  226. _LEXER_ERR_CASE(LEXER_ERR_OK);
  227. _LEXER_ERR_CASE(LEXER_ERR_INVALID_PP_IDENT);
  228. }
  229. return le;
  230. #undef _LEXER_ERR_CASE
  231. }
  232. bool
  233. _lexer_is_letter(char c)
  234. {
  235. return ( c >= 0x41 && c <= 0x5A ) \
  236. || ( c >= 0x61 && c <= 0x7A ) \
  237. || c == 0x5F;
  238. }
  239. bool
  240. _lexer_is_number(char c)
  241. {
  242. return ( c >= '0' && c <= '9' );
  243. }
  244. void
  245. _lexer_set_err(struct lexer_err *err, enum lexer_err_code code)
  246. {
  247. if ( err == NULL ) {
  248. return;
  249. }
  250. *err = lexer_err_create(code);
  251. }
  252. #endif /* defined(IMP) || defined(LEXER_IMP) */
  253. #endif /* LEXER_H */