/*
 * lexer.h - single-header lexer. Define IMP or LEXER_IMP before including
 * this file to also compile the function definitions.
 */
  1. #ifndef LEXER_H
  2. #define LEXER_H
  3. #include <stdlib.h>
  4. #include "./token.h"
  5. #include "./str.h"
  6. struct lexer {
  7. struct str in;
  8. size_t pos;
  9. size_t rpos;
  10. char c;
  11. };
  /* Status codes reported through struct lexer_err. */
  enum lexer_err_code {
      /* No error. */
      LEXER_ERR_OK = 0,
      /* '#' was not followed by a recognised directive name. */
      LEXER_ERR_INVALID_PP_IDENT,
      /* #include operand was not closed before end of line / end of input. */
      LEXER_ERR_INVALID_PP_INCLUDE
  };
  17. struct lexer_err {
  18. enum lexer_err_code code;
  19. const char *name;
  20. };
  /* Create a lexer over `in` with its first character already loaded. */
  struct lexer lexer_create(struct str in);

  /* Advance the cursor one character; `c` becomes '\0' past the end of input. */
  void lexer_read_char(struct lexer *l);

  /* Return the character after the current one without advancing ('\0' at end). */
  char lexer_peek_char(struct lexer *l);

  /* Scan and return the next token; `err` may be NULL. */
  struct token lexer_next_token(struct lexer *l, struct lexer_err *err);

  /* Read the identifier starting at the cursor and return it as a slice of the input. */
  struct str lexer_read_ident(struct lexer *l);

  /* Read a double-quoted string literal; the cursor must be on the opening quote. */
  struct str lexer_read_str_lit(struct lexer *l);

  /* Read a run of decimal digits starting at the cursor. */
  struct str lexer_read_int_lit(struct lexer *l);

  /*
   * Read up to (and consume) `tgt`. Stops early when `limit` or the end of
   * input comes first; `*reached_limit` (if non-NULL) tells which happened.
   */
  struct str lexer_read_until_or(struct lexer *l, char tgt, char limit,
                                 bool *reached_limit);

  /* Classify an identifier: a keyword token type, or TT_IDENT. */
  enum token_type_enum lexer_lookup_ident(struct str ident);

  /* Skip spaces, tabs, carriage returns and newlines. */
  void lexer_skip_whitespace(struct lexer *l);

  /* Lex a preprocessor directive; the cursor must be on the directive name. */
  struct token lexer_handle_pp(struct lexer *l, struct lexer_err *err);

  /* Map a directive name to its TT_PP_* type, or TT_ILLEGAL. */
  enum token_type_enum _lexer_lookup_pp(struct str ident);

  /* Build a lexer_err for `code`, including its printable name. */
  struct lexer_err lexer_err_create(enum lexer_err_code code);

  /* True for ASCII letters and '_'. */
  bool _lexer_is_letter(char c);

  /* True for '0'..'9'. */
  bool _lexer_is_number(char c);

  /* Store lexer_err_create(code) into *err; does nothing when err is NULL. */
  void _lexer_set_err(struct lexer_err *err, enum lexer_err_code code);
  38. #if defined(IMP) || defined(LEXER_IMP)
  39. struct lexer
  40. lexer_create(struct str in)
  41. {
  42. struct lexer l = {0};
  43. l.in = in;
  44. lexer_read_char(&l);
  45. return l;
  46. }
  47. void
  48. lexer_read_char(struct lexer *l)
  49. {
  50. if ( l->rpos >= l->in.size ) {
  51. l->c = '\0';
  52. } else {
  53. l->c = l->in.data[l->rpos];
  54. }
  55. l->pos = l->rpos;
  56. ++l->rpos;
  57. }
  58. char
  59. lexer_peek_char(struct lexer *l)
  60. {
  61. if ( l->rpos >= l->in.size ) {
  62. return '\0';
  63. }
  64. return l->in.data[l->rpos];
  65. }
  66. struct token
  67. lexer_next_token(struct lexer *l, struct lexer_err *err)
  68. {
  69. #define _LEXER_CUR_CHAR str_slice(l->in, l->pos, l->pos+1)
  70. #define _LEXER_2_CHAR str_slice(l->in, l->pos, l->pos+2)
  71. struct token t = TOKEN_ILLEGAL;
  72. lexer_skip_whitespace(l);
  73. switch ( l->c ) {
  74. case '=':
  75. if ( lexer_peek_char(l) == '=' ) {
  76. t = token_create(TT_EQ, _LEXER_2_CHAR);
  77. lexer_read_char(l);
  78. goto aft_swt;
  79. }
  80. t = token_create(TT_ASSIGN, _LEXER_CUR_CHAR);
  81. break;
  82. case ';': t = token_create(TT_SEMICOLON, _LEXER_CUR_CHAR); break;
  83. case '(': t = token_create(TT_LPAREN, _LEXER_CUR_CHAR); break;
  84. case ')': t = token_create(TT_RPAREN, _LEXER_CUR_CHAR); break;
  85. case '!':
  86. if ( lexer_peek_char(l) == '=' ) {
  87. t = token_create(TT_NOT_EQ, _LEXER_2_CHAR);
  88. lexer_read_char(l);
  89. goto aft_swt;
  90. }
  91. t = token_create(TT_BANG, _LEXER_CUR_CHAR);
  92. break;
  93. case '/': t = token_create(TT_SLASH, _LEXER_CUR_CHAR); break;
  94. case ',': t = token_create(TT_COMMA, _LEXER_CUR_CHAR); break;
  95. case '*': t = token_create(TT_ASTERISK, _LEXER_CUR_CHAR); break;
  96. case '-': t = token_create(TT_DASH, _LEXER_CUR_CHAR); break;
  97. case '.': t = token_create(TT_DOT, _LEXER_CUR_CHAR); break;
  98. case '+': t = token_create(TT_PLUS, _LEXER_CUR_CHAR); break;
  99. case '{': t = token_create(TT_LBRACE, _LEXER_CUR_CHAR); break;
  100. case '}': t = token_create(TT_RBRACE, _LEXER_CUR_CHAR); break;
  101. case '<': t = token_create(TT_LABRACKET, _LEXER_CUR_CHAR); break;
  102. case '>': t = token_create(TT_RABRACKET, _LEXER_CUR_CHAR); break;
  103. case '\'': t = token_create(TT_SQUOTE, _LEXER_CUR_CHAR); break;
  104. case '#':
  105. lexer_read_char(l);
  106. if ( _lexer_is_letter(l->c) ) {
  107. t = lexer_handle_pp(l, err);
  108. if ( t.typ.code == TT_ILLEGAL ) {
  109. goto ret_invalid;
  110. }
  111. goto ret_ok;
  112. }
  113. _lexer_set_err(err, LEXER_ERR_INVALID_PP_IDENT);
  114. goto ret_invalid;
  115. break;
  116. case '"':
  117. t = token_create(TT_STR_LIT, lexer_read_str_lit(l));
  118. goto ret_ok;
  119. break;
  120. case '\0': t = token_create(TT_EOF, STR_EMPTY); break;
  121. default:
  122. if ( _lexer_is_letter(l->c) ) {
  123. struct str ident = lexer_read_ident(l);
  124. t = token_create(lexer_lookup_ident(ident), ident);
  125. goto ret_ok;
  126. }
  127. if ( _lexer_is_number(l->c) ) {
  128. t = token_create(TT_INT_LIT, lexer_read_int_lit(l));
  129. goto ret_ok;
  130. }
  131. break;
  132. }
  133. aft_swt:
  134. lexer_read_char(l);
  135. ret_ok:
  136. _lexer_set_err(err, LEXER_ERR_OK);
  137. return t;
  138. ret_invalid:
  139. return TOKEN_ILLEGAL;
  140. #undef _LEXER_2_CHAR
  141. #undef _LEXER_CUR_CHAR
  142. }
  143. struct str
  144. lexer_read_ident(struct lexer *l)
  145. {
  146. size_t pos = l->pos;
  147. while ( _lexer_is_letter(l->c) ) {
  148. lexer_read_char(l);
  149. }
  150. return str_slice(l->in, pos, l->pos);
  151. }
  152. struct str
  153. lexer_read_str_lit(struct lexer *l)
  154. {
  155. size_t pos = l->pos;
  156. lexer_read_char(l);
  157. loop:
  158. switch ( l->c ) {
  159. case '"':
  160. case '\0':
  161. break;
  162. default:
  163. lexer_read_char(l);
  164. goto loop;
  165. break;
  166. }
  167. lexer_read_char(l);
  168. return str_slice(l->in, pos+1, l->pos-1);
  169. }
  170. struct str
  171. lexer_read_int_lit(struct lexer *l)
  172. {
  173. size_t pos = l->pos;
  174. while ( _lexer_is_number(l->c) ) {
  175. lexer_read_char(l);
  176. }
  177. return str_slice(l->in, pos, l->pos);
  178. }
  179. struct str
  180. lexer_read_until_or(struct lexer *l, char tgt, char limit, bool *reached_limit)
  181. {
  182. size_t pos = l->pos;
  183. loop:
  184. if ( l->c == tgt ) {
  185. goto ret_ok;
  186. }
  187. if ( l->c == limit || l->c == '\0' ) {
  188. goto ret_err;
  189. }
  190. lexer_read_char(l);
  191. goto loop;
  192. ret_ok:
  193. if ( reached_limit != NULL ) {
  194. *reached_limit = false;
  195. }
  196. lexer_read_char(l);
  197. return str_slice(l->in, pos, l->pos-1);
  198. ret_err:
  199. if ( reached_limit != NULL ) {
  200. *reached_limit = true;
  201. }
  202. return STR_EMPTY;
  203. }
  204. enum token_type_enum
  205. lexer_lookup_ident(struct str ident)
  206. {
  207. if ( ident.size < 2 ) {
  208. return TT_IDENT;
  209. }
  210. switch ( ident.data[0] ) {
  211. case 'i':
  212. if ( str_eq_cstr(ident, "if", 2) ) {
  213. return TT_IF;
  214. }
  215. if ( str_eq_cstr(ident, "int", 3) ) {
  216. return TT_TYPE;
  217. }
  218. break;
  219. case 's':
  220. if ( str_eq_cstr(ident, "switch", 6) ) {
  221. return TT_SWITCH;
  222. }
  223. if ( str_eq_cstr(ident, "struct", 6) ) {
  224. return TT_SWITCH;
  225. }
  226. break;
  227. case 'c':
  228. if ( str_eq_cstr(ident, "char", 4) ) {
  229. return TT_TYPE;
  230. }
  231. if ( str_eq_cstr(ident, "case", 4) ) {
  232. return TT_CASE;
  233. }
  234. if ( str_eq_cstr(ident, "const", 5) ) {
  235. return TT_CONST;
  236. }
  237. if ( str_eq_cstr(ident, "continue", 8) ) {
  238. return TT_CONST;
  239. }
  240. break;
  241. case 'r':
  242. if ( str_eq_cstr(ident, "return", 6) ) {
  243. return TT_RETURN;
  244. }
  245. break;
  246. case 'e':
  247. if ( str_eq_cstr(ident, "else", 4) ) {
  248. return TT_ELSE;
  249. }
  250. if ( str_eq_cstr(ident, "enum", 4) ) {
  251. return TT_ENUM;
  252. }
  253. break;
  254. case 'f':
  255. if ( str_eq_cstr(ident, "for", 3) ) {
  256. return TT_FOR;
  257. }
  258. break;
  259. case 'b':
  260. if ( str_eq_cstr(ident, "break", 5) ) {
  261. return TT_BREAK;
  262. }
  263. break;
  264. case 'd':
  265. if ( str_eq_cstr(ident, "do", 2) ) {
  266. return TT_WHILE;
  267. }
  268. break;
  269. case 'w':
  270. if ( str_eq_cstr(ident, "while", 5) ) {
  271. return TT_WHILE;
  272. }
  273. break;
  274. }
  275. return TT_IDENT;
  276. }
  277. void
  278. lexer_skip_whitespace(struct lexer *l)
  279. {
  280. loop:
  281. switch ( l->c ) {
  282. case ' ':
  283. case '\t':
  284. case '\r':
  285. case '\n':
  286. lexer_read_char(l);
  287. goto loop;
  288. }
  289. return;
  290. }
  291. struct token
  292. lexer_handle_pp(struct lexer *l, struct lexer_err *err)
  293. {
  294. struct token t = TOKEN_ILLEGAL;
  295. struct str ident = lexer_read_ident(l);
  296. enum token_type_enum tt = _lexer_lookup_pp(ident);
  297. bool reached_limit = false;
  298. if ( tt == TT_ILLEGAL ) {
  299. _lexer_set_err(err, LEXER_ERR_INVALID_PP_IDENT);
  300. goto ret_illegal;
  301. }
  302. lexer_skip_whitespace(l);
  303. switch ( tt ) {
  304. case TT_PP_INCLUDE:
  305. switch ( l->c ) {
  306. case '<':
  307. lexer_read_char(l);
  308. ident = lexer_read_until_or(l, '>', '\n', &reached_limit);
  309. if ( reached_limit == true ) {
  310. _lexer_set_err(err, LEXER_ERR_INVALID_PP_INCLUDE);
  311. goto ret_illegal;
  312. }
  313. t = token_create(tt, ident);
  314. goto ret_ok;
  315. break;
  316. case '"':
  317. lexer_read_char(l);
  318. ident = lexer_read_until_or(l, '"', '\n', &reached_limit);
  319. if ( reached_limit == true ) {
  320. _lexer_set_err(err, LEXER_ERR_INVALID_PP_INCLUDE);
  321. goto ret_illegal;
  322. }
  323. t = token_create(tt, ident);
  324. goto ret_ok;
  325. break;
  326. default: goto ret_illegal; break;
  327. }
  328. goto ret_ok;
  329. break;
  330. case TT_PP_DEFINE:
  331. ident = lexer_read_until_or(l, '\n', '\\', &reached_limit);
  332. if ( reached_limit == true ) {
  333. goto ret_illegal;
  334. }
  335. t = token_create(tt, ident);
  336. goto ret_ok;
  337. break;
  338. case TT_PP_IFNDEF: goto ret_illegal; break;
  339. case TT_PP_IFDEF: goto ret_illegal; break;
  340. case TT_PP_IF: goto ret_illegal; break;
  341. default: goto ret_illegal; break;
  342. }
  343. ret_ok:
  344. _lexer_set_err(err, LEXER_ERR_OK);
  345. return t;
  346. ret_illegal:
  347. return TOKEN_ILLEGAL;
  348. }
  349. enum token_type_enum
  350. _lexer_lookup_pp(struct str ident)
  351. {
  352. if ( ident.size < 2 ) {
  353. return TT_ILLEGAL;
  354. }
  355. switch ( ident.size ) {
  356. case 7:
  357. if ( str_eq_cstr(ident, "include", 7) ) {
  358. return TT_PP_INCLUDE;
  359. }
  360. break;
  361. case 6:
  362. if ( str_eq_cstr(ident, "define", 6) ) {
  363. return TT_PP_DEFINE;
  364. }
  365. if ( str_eq_cstr(ident, "ifndef", 6) ) {
  366. return TT_PP_IFNDEF;
  367. }
  368. break;
  369. case 5:
  370. if ( str_eq_cstr(ident, "ifdef", 5) ) {
  371. return TT_PP_IFDEF;
  372. }
  373. break;
  374. case 2:
  375. if ( str_eq_cstr(ident, "if", 2) ) {
  376. return TT_PP_IF;
  377. }
  378. break;
  379. }
  380. return TT_ILLEGAL;
  381. }
  382. struct lexer_err
  383. lexer_err_create(enum lexer_err_code code)
  384. {
  385. #define _LEXER_ERR_CASE(it) case it: le.name = #it; break;
  386. struct lexer_err le = {0};
  387. le.code = code;
  388. switch ( code ) {
  389. _LEXER_ERR_CASE(LEXER_ERR_OK);
  390. _LEXER_ERR_CASE(LEXER_ERR_INVALID_PP_IDENT);
  391. _LEXER_ERR_CASE(LEXER_ERR_INVALID_PP_INCLUDE);
  392. }
  393. return le;
  394. #undef _LEXER_ERR_CASE
  395. }
  /*
   * True when `c` can appear in an identifier as a letter: 'A'..'Z',
   * 'a'..'z' or '_'. Digits are not letters.
   */
  bool
  _lexer_is_letter(char c)
  {
      if ( c == '_' ) {
          return true;
      }
      if ( c >= 'A' && c <= 'Z' ) {
          return true;
      }
      return c >= 'a' && c <= 'z';
  }
  /* True when `c` is a decimal digit, '0' through '9'. */
  bool
  _lexer_is_number(char c)
  {
      if ( c < '0' ) {
          return false;
      }
      return c <= '9';
  }
  408. void
  409. _lexer_set_err(struct lexer_err *err, enum lexer_err_code code)
  410. {
  411. if ( err == NULL ) {
  412. return;
  413. }
  414. *err = lexer_err_create(code);
  415. }
  416. #endif /* defined(IMP) || defined(LEXER_IMP) */
  417. #endif /* LEXER_H */