lexer.h 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. #ifndef LEXER_H
  2. #define LEXER_H
  3. #include <stdlib.h>
  4. #include "./token.h"
  5. #include "./str.h"
  6. struct lexer {
  7. struct str in;
  8. size_t pos;
  9. size_t rpos;
  10. char c;
  11. };
  12. struct lexer lexer_create(struct str in);
  13. void lexer_read_char(struct lexer *l);
  14. struct token lexer_next_token(struct lexer *l);
  15. struct str lexer_read_ident(struct lexer *l);
  16. struct str lexer_read_str_lit(struct lexer *l);
  17. struct str lexer_read_int_lit(struct lexer *l);
  18. enum token_type_enum lexer_lookup_ident(struct str ident);
  19. void lexer_skip_whitespace(struct lexer *l);
  20. bool _lexer_is_letter(char c);
  21. bool _lexer_is_number(char c);
  22. #if defined(IMP) || defined(LEXER_IMP)
  23. struct lexer
  24. lexer_create(struct str in)
  25. {
  26. struct lexer l = {0};
  27. l.in = in;
  28. lexer_read_char(&l);
  29. return l;
  30. }
  31. void
  32. lexer_read_char(struct lexer *l)
  33. {
  34. if ( l->rpos >= l->in.size ) {
  35. l->c = '\0';
  36. } else {
  37. l->c = l->in.data[l->rpos];
  38. }
  39. l->pos = l->rpos;
  40. ++l->rpos;
  41. }
  42. struct token
  43. lexer_next_token(struct lexer *l)
  44. {
  45. #define _LEXER_CUR_CHAR str_slice(l->in, l->pos, l->pos+1)
  46. struct token t = TOKEN_ILLEGAL;
  47. lexer_skip_whitespace(l);
  48. switch ( l->c ) {
  49. case '=': t = token_create(TT_ASSIGN, _LEXER_CUR_CHAR); break;
  50. case ';': t = token_create(TT_SEMICOLON, _LEXER_CUR_CHAR); break;
  51. case '(': t = token_create(TT_LPAREN, _LEXER_CUR_CHAR); break;
  52. case ')': t = token_create(TT_RPAREN, _LEXER_CUR_CHAR); break;
  53. case ',': t = token_create(TT_COMMA, _LEXER_CUR_CHAR); break;
  54. case '*': t = token_create(TT_ASTERISK, _LEXER_CUR_CHAR); break;
  55. case '-': t = token_create(TT_DASH, _LEXER_CUR_CHAR); break;
  56. case '.': t = token_create(TT_DOT, _LEXER_CUR_CHAR); break;
  57. case '+': t = token_create(TT_PLUS, _LEXER_CUR_CHAR); break;
  58. case '{': t = token_create(TT_LBRACE, _LEXER_CUR_CHAR); break;
  59. case '}': t = token_create(TT_RBRACE, _LEXER_CUR_CHAR); break;
  60. case '<': t = token_create(TT_LABRACKET, _LEXER_CUR_CHAR); break;
  61. case '>': t = token_create(TT_RABRACKET, _LEXER_CUR_CHAR); break;
  62. case '\'': t = token_create(TT_SQUOTE, _LEXER_CUR_CHAR); break;
  63. case '"':
  64. t = token_create(TT_STR_LIT, lexer_read_str_lit(l));
  65. return t;
  66. break;
  67. case '\0': t = token_create(TT_EOF, STR_EMPTY); break;
  68. default:
  69. if ( _lexer_is_letter(l->c) ) {
  70. struct str ident = lexer_read_ident(l);
  71. t = token_create(lexer_lookup_ident(ident), ident);
  72. return t;
  73. }
  74. if ( _lexer_is_number(l->c) ) {
  75. return token_create(TT_INT_LIT, lexer_read_int_lit(l));
  76. }
  77. break;
  78. }
  79. lexer_read_char(l);
  80. return t;
  81. #undef _LEXER_CUR_CHAR
  82. }
  83. struct str
  84. lexer_read_ident(struct lexer *l)
  85. {
  86. size_t pos = l->pos;
  87. while ( _lexer_is_letter(l->c) ) {
  88. lexer_read_char(l);
  89. }
  90. return str_slice(l->in, pos, l->pos);
  91. }
  92. struct str
  93. lexer_read_str_lit(struct lexer *l)
  94. {
  95. size_t pos = l->pos;
  96. lexer_read_char(l);
  97. loop:
  98. switch ( l->c ) {
  99. case '"':
  100. case '\0':
  101. break;
  102. default:
  103. lexer_read_char(l);
  104. goto loop;
  105. break;
  106. }
  107. lexer_read_char(l);
  108. return str_slice(l->in, pos+1, l->pos-1);
  109. }
  110. struct str
  111. lexer_read_int_lit(struct lexer *l)
  112. {
  113. size_t pos = l->pos;
  114. while ( _lexer_is_number(l->c) ) {
  115. lexer_read_char(l);
  116. }
  117. return str_slice(l->in, pos, l->pos);
  118. }
  119. enum token_type_enum
  120. lexer_lookup_ident(struct str ident)
  121. {
  122. if ( ident.size < 3 ) {
  123. return TT_IDENT;
  124. }
  125. switch ( ident.data[0] ) {
  126. case 'i':
  127. if ( str_eq_cstr(ident, "int", 3) ) {
  128. return TT_TYPE;
  129. }
  130. if ( str_eq_cstr(ident, "include", 7) ) {
  131. return TT_INCLUDE;
  132. }
  133. break;
  134. case 'c':
  135. if ( str_eq_cstr(ident, "char", 4) ) {
  136. return TT_TYPE;
  137. }
  138. if ( str_eq_cstr(ident, "const", 5) ) {
  139. return TT_CONST;
  140. }
  141. break;
  142. case 'r':
  143. if ( str_eq_cstr(ident, "return", 6) ) {
  144. return TT_RETURN;
  145. }
  146. break;
  147. }
  148. return TT_IDENT;
  149. }
  150. void
  151. lexer_skip_whitespace(struct lexer *l)
  152. {
  153. loop:
  154. switch ( l->c ) {
  155. case ' ':
  156. case '\t':
  157. case '\r':
  158. case '\n':
  159. lexer_read_char(l);
  160. goto loop;
  161. }
  162. return;
  163. }
  164. bool
  165. _lexer_is_letter(char c)
  166. {
  167. return ( c >= 0x41 && c <= 0x5A ) \
  168. || ( c >= 0x61 && c <= 0x7A ) \
  169. || c == 0x5F;
  170. }
  171. bool
  172. _lexer_is_number(char c)
  173. {
  174. return ( c >= '0' && c <= '9' );
  175. }
  176. #endif /* defined(IMP) || defined(LEXER_IMP) */
  177. #endif /* LEXER_H */