1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.mal_lang.lib;
18
19 import java.io.File;
20 import java.io.IOException;
21 import java.nio.charset.StandardCharsets;
22 import java.nio.file.Files;
23 import java.util.ArrayList;
24 import java.util.HashMap;
25 import java.util.List;
26 import java.util.Locale;
27 import java.util.Map;
28
29 public class Lexer {
30 private MalLogger LOGGER;
31 private String filename;
32 private byte[] input;
33 private int index;
34 private int line;
35 private int col;
36 private int startLine;
37 private int startCol;
38 private List<Byte> lexeme;
39 private List<Token> comments = new ArrayList<>();
40 private boolean eof;
41
42 private static Map<String, TokenType> keywords;
43
44 static {
45 keywords = new HashMap<>();
46 keywords.put("include", TokenType.INCLUDE);
47 keywords.put("info", TokenType.INFO);
48 keywords.put("category", TokenType.CATEGORY);
49 keywords.put("abstract", TokenType.ABSTRACT);
50 keywords.put("asset", TokenType.ASSET);
51 keywords.put("extends", TokenType.EXTENDS);
52 keywords.put("associations", TokenType.ASSOCIATIONS);
53 keywords.put("let", TokenType.LET);
54 keywords.put("E", TokenType.EXIST);
55 keywords.put("C", TokenType.C);
56 keywords.put("I", TokenType.I);
57 keywords.put("A", TokenType.A);
58 }
59
60 private static Map<String, Byte> escapeSequences;
61
62 static {
63 escapeSequences = new HashMap<>();
64 escapeSequences.put("\\b", (byte) '\b');
65 escapeSequences.put("\\n", (byte) '\n');
66 escapeSequences.put("\\t", (byte) '\t');
67 escapeSequences.put("\\r", (byte) '\r');
68 escapeSequences.put("\\f", (byte) '\f');
69 escapeSequences.put("\\\"", (byte) '"');
70 escapeSequences.put("\\\\", (byte) '\\');
71 }
72
73 public Lexer(File file) throws IOException {
74 this(file, file.getName(), false, false);
75 }
76
77 public Lexer(File file, boolean verbose, boolean debug) throws IOException {
78 this(file, file.getName(), verbose, debug);
79 }
80
81 public Lexer(File file, String relativeName) throws IOException {
82 this(file, relativeName, false, false);
83 }
84
85 public Lexer(File file, String relativeName, boolean verbose, boolean debug) throws IOException {
86 Locale.setDefault(Locale.ROOT);
87 LOGGER = new MalLogger("LEXER", verbose, debug);
88 try {
89 LOGGER.debug(String.format("Creating lexer with file '%s'", relativeName));
90 if (!file.exists()) {
91 throw new IOException(String.format("%s: No such file or directory", relativeName));
92 }
93 this.filename = relativeName;
94 this.input = Files.readAllBytes(file.toPath());
95 this.index = 0;
96 this.line = 1;
97 this.col = 1;
98 this.eof = input.length == 0;
99 } catch (IOException e) {
100 LOGGER.print();
101 throw e;
102 }
103 }
104
105 public static boolean syntacticallyEqual(Lexer l1, Lexer l2) {
106 try {
107 var tok1 = l1.next();
108 var tok2 = l2.next();
109 while (tok1.type != TokenType.EOF && tok2.type != TokenType.EOF) {
110 if (tok1.type != tok2.type
111 || !tok1.stringValue.equals(tok2.stringValue)
112 || tok1.intValue != tok2.intValue
113 || tok1.doubleValue != tok2.doubleValue) {
114 return false;
115 }
116 tok1 = l1.next();
117 tok2 = l2.next();
118 }
119 return tok1.type == TokenType.EOF && tok2.type == TokenType.EOF;
120 } catch (CompilerException e) {
121 return false;
122 }
123 }
124
125 private String getLexemeString() {
126 byte[] byteArray = new byte[lexeme.size()];
127 for (int i = 0; i < lexeme.size(); i++) {
128 byteArray[i] = lexeme.get(i).byteValue();
129 }
130 return new String(byteArray, StandardCharsets.UTF_8);
131 }
132
133 public Token next() throws CompilerException {
134 startLine = line;
135 startCol = col;
136 lexeme = new ArrayList<>();
137 if (eof) {
138 LOGGER.print();
139 return createToken(TokenType.EOF);
140 }
141 byte c = consume();
142 switch (c) {
143 case ' ':
144 case '\t':
145 case '\r':
146 case '\n':
147 return next();
148 case '#':
149 return createToken(TokenType.HASH);
150 case ':':
151 return createToken(TokenType.COLON);
152 case '{':
153 return createToken(TokenType.LCURLY);
154 case '}':
155 return createToken(TokenType.RCURLY);
156 case '+':
157 if (peek('>')) {
158 consume();
159 return createToken(TokenType.INHERIT);
160 } else {
161 return createToken(TokenType.PLUS);
162 }
163 case '-':
164 if (peek('>')) {
165 consume();
166 return createToken(TokenType.OVERRIDE);
167 } else if (peek("->")) {
168 consume(2);
169 return createToken(TokenType.RARROW);
170 } else {
171 return createToken(TokenType.MINUS);
172 }
173 case '&':
174 return createToken(TokenType.ALL);
175 case '|':
176 return createToken(TokenType.ANY);
177 case '!':
178 if (peek('E')) {
179 consume();
180 return createToken(TokenType.NOTEXIST);
181 } else {
182 throw exception("Expected 'E'");
183 }
184 case '@':
185 return createToken(TokenType.AT);
186 case '[':
187 return createToken(TokenType.LBRACKET);
188 case ']':
189 return createToken(TokenType.RBRACKET);
190 case '(':
191 return createToken(TokenType.LPAREN);
192 case ')':
193 return createToken(TokenType.RPAREN);
194 case ',':
195 return createToken(TokenType.COMMA);
196 case '<':
197 if (peek("--")) {
198 consume(2);
199 return createToken(TokenType.LARROW);
200 } else if (peek('-')) {
201 consume();
202 return createToken(TokenType.REQUIRE);
203 } else {
204 throw exception("Expected '-' or '--'");
205 }
206 case '=':
207 return createToken(TokenType.ASSIGN);
208 case '\\':
209 if (peek('/')) {
210 consume();
211 return createToken(TokenType.UNION);
212 } else {
213 throw exception("Expected '/'");
214 }
215 case '/':
216 if (peek('\\')) {
217 consume();
218 return createToken(TokenType.INTERSECT);
219 } else if (peek('/')) {
220 while (!eof && !peek('\n') && !peek('\r')) {
221 consume();
222 }
223 createComment(TokenType.SINGLECOMMENT);
224 return next();
225 } else if (peek('*')) {
226 consume();
227 while (!peek("*/")) {
228 if (eof) {
229 throw exception(
230 String.format(
231 "Unterminated comment starting at %s",
232 new Position(filename, startLine, startCol)));
233 }
234 consume();
235 }
236 consume(2);
237 createComment(TokenType.MULTICOMMENT);
238 return next();
239 } else {
240 return createToken(TokenType.DIVIDE);
241 }
242 case '.':
243 if (peek('.')) {
244 consume();
245 return createToken(TokenType.RANGE);
246 } else {
247 return createToken(TokenType.DOT);
248 }
249 case '*':
250 return createToken(TokenType.STAR);
251 case '^':
252 return createToken(TokenType.POWER);
253 case '"':
254 if (peek("\"\"")) {
255 consume(2);
256 while (peek(' ') || peek('\t')) {
257 consume();
258 }
259 if (peek('\r')) {
260 consume();
261 if (peek('\n')) {
262 consume();
263 }
264 } else if (peek('\n')) {
265 consume();
266 } else {
267 throw exception("Expected line terminator");
268 }
269 lexeme = new ArrayList<>();
270 while (!peek("\"\"\"")) {
271 if (eof) {
272 throw exception(
273 String.format(
274 "Unterminated multi-line string starting at %s",
275 new Position(filename, startLine, startCol)));
276 } else if (peek('\r')) {
277 consume();
278 lexeme = lexeme.subList(0, lexeme.size() - 1);
279 lexeme.add((byte) '\n');
280 if (peek('\n')) {
281 consume();
282 lexeme = lexeme.subList(0, lexeme.size() - 1);
283 }
284 } else if (peek('\\')) {
285 consume();
286 if (input[index] < 32 || input[index] > 126) {
287 throw exception(String.format("Invalid escape byte 0x%02X", input[index]));
288 }
289 consume();
290 var lexemeString = getLexemeString();
291 String escapeSequence = lexemeString.substring(lexemeString.length() - 2);
292 lexeme = lexeme.subList(0, lexeme.size() - 2);
293 if (!escapeSequences.containsKey(escapeSequence)) {
294 throw exception(String.format("Invalid escape sequence '%s'", escapeSequence));
295 }
296 lexeme.add(escapeSequences.get(escapeSequence));
297 } else {
298 consume();
299 }
300 }
301 consume(3);
302 lexeme = lexeme.subList(0, lexeme.size() - 3);
303 return createToken(TokenType.MULTI_STRING);
304 }
305 while (!peek('"')) {
306 if (peek('\\')) {
307 consume();
308 if (eof || peek('\n')) {
309 throw exception(
310 String.format(
311 "Unterminated string starting at %s",
312 new Position(filename, startLine, startCol)));
313 }
314 if (input[index] < 32 || input[index] > 126) {
315 throw exception(String.format("Invalid escape byte 0x%02X", input[index]));
316 }
317 consume();
318 var lexemeString = getLexemeString();
319 String escapeSequence = lexemeString.substring(lexemeString.length() - 2);
320 lexeme = lexeme.subList(0, lexeme.size() - 2);
321 if (!escapeSequences.containsKey(escapeSequence)) {
322 throw exception(String.format("Invalid escape sequence '%s'", escapeSequence));
323 }
324 lexeme.add(escapeSequences.get(escapeSequence));
325 } else if (eof || peek('\n')) {
326 throw exception(
327 String.format(
328 "Unterminated string starting at %s",
329 new Position(filename, startLine, startCol)));
330 } else {
331 consume();
332 }
333 }
334 consume();
335 return createToken(TokenType.STRING);
336 default:
337 if (isAlpha(c)) {
338 while (isAlphaNumeric()) {
339 consume();
340 }
341 var lexemeString = getLexemeString();
342 if (keywords.containsKey(lexemeString)) {
343 return createToken(keywords.get(lexemeString));
344 } else {
345 return createToken(TokenType.ID);
346 }
347 } else if (isDigit(c)) {
348 while (isDigit()) {
349 consume();
350 }
351 if (peek("..") || !peek('.')) {
352 return createToken(TokenType.INT);
353 } else if (peek('.')) {
354 consume();
355 while (isDigit()) {
356 consume();
357 }
358 return createToken(TokenType.FLOAT);
359 }
360 }
361 if (c < 0) {
362 throw exception(String.format("Unexpected token 0x%02X", c));
363 } else {
364 throw exception(String.format("Unexpected token '%c'", (char) c));
365 }
366 }
367 }
368
369 private void consume(int n) {
370 for (int i = 0; i < n; i++) {
371 consume();
372 }
373 }
374
375 private byte consume() {
376 if (eof) {
377 throw new RuntimeException("Consuming past end-of-file");
378 }
379 if (input[index] == (byte) '\n') {
380 line++;
381 col = 1;
382 } else {
383 col++;
384 }
385 var c = input[index++];
386 lexeme.add(c);
387 if (index == input.length) {
388 eof = true;
389 }
390 return c;
391 }
392
393 private boolean peek(String s) {
394 var bytes = s.getBytes();
395 if (input.length - index < bytes.length) {
396 return false;
397 }
398 for (int i = 0; i < bytes.length; i++) {
399 if (bytes[i] != input[index + i]) {
400 return false;
401 }
402 }
403 return true;
404 }
405
406 private boolean peek(char c) {
407 return peek((byte) c);
408 }
409
410 private boolean peek(byte c) {
411 if (eof) {
412 return false;
413 } else {
414 return c == input[index];
415 }
416 }
417
418 private void createComment(TokenType type) {
419 var lexemeString = getLexemeString();
420 lexemeString = lexemeString.substring(2, lexemeString.length());
421 if (type == TokenType.MULTICOMMENT) {
422 lexemeString = lexemeString.substring(0, lexemeString.length() - 2);
423 }
424 comments.add(new Token(type, filename, startLine, startCol, lexemeString));
425 }
426
427 private Token createRawToken(TokenType type) {
428 switch (type) {
429 case INT:
430 return new Token(type, filename, startLine, startCol, Integer.parseInt(getLexemeString()));
431 case FLOAT:
432 return new Token(
433 type, filename, startLine, startCol, Double.parseDouble(getLexemeString()));
434 case ID:
435 return new Token(type, filename, startLine, startCol, getLexemeString());
436 case STRING:
437 {
438 var lexemeString = getLexemeString();
439 return new Token(
440 type,
441 filename,
442 startLine,
443 startCol,
444 lexemeString.substring(1, lexemeString.length() - 1));
445 }
446 case MULTI_STRING:
447 {
448 var lexemeString = getLexemeString();
449 var lines = lexemeString.split("\\R");
450
451
452 int minIndent = -1;
453 for (int i = 0; i < lines.length; i++) {
454 var line = lines[i];
455 if (!line.isBlank() || i + 1 == lines.length) {
456 var indent = 0;
457 for (int j = 0; j < line.length(); j++) {
458 if (!Character.isWhitespace(line.charAt(j))) {
459 break;
460 }
461 indent += 1;
462 }
463 if (minIndent == -1 || indent < minIndent) {
464 minIndent = indent;
465 }
466 }
467 }
468
469
470 var newLines = new String[lines.length];
471 for (int i = 0; i < lines.length; i++) {
472 var line = lines[i];
473 if (line.isBlank()) {
474 newLines[i] = "";
475 } else {
476 if (minIndent != -1) {
477 line = line.substring(minIndent);
478 }
479 newLines[i] = line.stripTrailing();
480 }
481 }
482
483 return new Token(
484 TokenType.STRING, filename, startLine, startCol, String.join("\n", newLines));
485 }
486 default:
487 return new Token(type, filename, startLine, startCol);
488 }
489 }
490
491 @SuppressWarnings("fallthrough")
492 private void readTrailingComments() throws CompilerException {
493
494
495
496 startLine = line;
497 startCol = col;
498 lexeme = new ArrayList<>();
499 if (eof || peek('\n')) {
500 return;
501 }
502 byte c = consume();
503 switch (c) {
504 case ' ':
505 case '\t':
506 readTrailingComments();
507 return;
508 case '/':
509 if (peek('/')) {
510 while (!eof && !peek('\n') && !peek('\r')) {
511 consume();
512 }
513 createComment(TokenType.SINGLECOMMENT);
514 if (peek("\r\n")) {
515 consume(2);
516 readTrailingComments();
517 } else if (peek('\n')) {
518 consume();
519 readTrailingComments();
520 }
521 return;
522 } else if (peek('*')) {
523 consume();
524 while (!peek("*/")) {
525 if (eof) {
526 throw exception(
527 String.format(
528 "Unterminated comment starting at %s",
529 new Position(filename, startLine, startCol)));
530 }
531 consume();
532 }
533 consume(2);
534 createComment(TokenType.MULTICOMMENT);
535 readTrailingComments();
536 return;
537 }
538
539 default:
540 index--;
541 col--;
542 eof = false;
543 return;
544 }
545 }
546
547 private Token createToken(TokenType type) throws CompilerException {
548 var token = createRawToken(type);
549 var preComments = List.copyOf(comments);
550 comments.clear();
551 readTrailingComments();
552 var postComments = List.copyOf(comments);
553 comments.clear();
554 return new Token(token, preComments, postComments);
555 }
556
557 private CompilerException exception(String msg) {
558 Position pos = null;
559 if (eof) {
560 pos = new Position(filename, line, col);
561 } else {
562 pos = new Position(filename, startLine, startCol);
563 }
564 LOGGER.error(pos, msg);
565 LOGGER.print();
566 return new CompilerException("There were syntax errors");
567 }
568
569 private boolean isDigit() {
570 if (eof) {
571 return false;
572 }
573 return isDigit(input[index]);
574 }
575
576 private boolean isDigit(byte c) {
577 return '0' <= c && c <= '9';
578 }
579
580 private boolean isAlpha(byte c) {
581 return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_';
582 }
583
584 private boolean isAlphaNumeric() {
585 if (eof) {
586 return false;
587 }
588 return isAlphaNumeric(input[index]);
589 }
590
591 private boolean isAlphaNumeric(byte c) {
592 return isDigit(c) || isAlpha(c);
593 }
594 }