View Javadoc
1   /*
2    * Copyright 2019-2022 Foreseeti AB <https://foreseeti.com>
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     https://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.mal_lang.lib;
18  
19  import java.io.File;
20  import java.io.IOException;
21  import java.nio.charset.StandardCharsets;
22  import java.nio.file.Files;
23  import java.util.ArrayList;
24  import java.util.HashMap;
25  import java.util.List;
26  import java.util.Locale;
27  import java.util.Map;
28  
29  public class Lexer {
30    private MalLogger LOGGER;
31    private String filename;
32    private byte[] input;
33    private int index;
34    private int line;
35    private int col;
36    private int startLine;
37    private int startCol;
38    private List<Byte> lexeme;
39    private List<Token> comments = new ArrayList<>();
40    private boolean eof;
41  
42    private static Map<String, TokenType> keywords;
43  
44    static {
45      keywords = new HashMap<>();
46      keywords.put("include", TokenType.INCLUDE);
47      keywords.put("info", TokenType.INFO);
48      keywords.put("category", TokenType.CATEGORY);
49      keywords.put("abstract", TokenType.ABSTRACT);
50      keywords.put("asset", TokenType.ASSET);
51      keywords.put("extends", TokenType.EXTENDS);
52      keywords.put("associations", TokenType.ASSOCIATIONS);
53      keywords.put("let", TokenType.LET);
54      keywords.put("E", TokenType.EXIST);
55      keywords.put("C", TokenType.C);
56      keywords.put("I", TokenType.I);
57      keywords.put("A", TokenType.A);
58    }
59  
60    private static Map<String, Byte> escapeSequences;
61  
62    static {
63      escapeSequences = new HashMap<>();
64      escapeSequences.put("\\b", (byte) '\b');
65      escapeSequences.put("\\n", (byte) '\n');
66      escapeSequences.put("\\t", (byte) '\t');
67      escapeSequences.put("\\r", (byte) '\r');
68      escapeSequences.put("\\f", (byte) '\f');
69      escapeSequences.put("\\\"", (byte) '"');
70      escapeSequences.put("\\\\", (byte) '\\');
71    }
72  
73    public Lexer(File file) throws IOException {
74      this(file, file.getName(), false, false);
75    }
76  
77    public Lexer(File file, boolean verbose, boolean debug) throws IOException {
78      this(file, file.getName(), verbose, debug);
79    }
80  
81    public Lexer(File file, String relativeName) throws IOException {
82      this(file, relativeName, false, false);
83    }
84  
85    public Lexer(File file, String relativeName, boolean verbose, boolean debug) throws IOException {
86      Locale.setDefault(Locale.ROOT);
87      LOGGER = new MalLogger("LEXER", verbose, debug);
88      try {
89        LOGGER.debug(String.format("Creating lexer with file '%s'", relativeName));
90        if (!file.exists()) {
91          throw new IOException(String.format("%s: No such file or directory", relativeName));
92        }
93        this.filename = relativeName;
94        this.input = Files.readAllBytes(file.toPath());
95        this.index = 0;
96        this.line = 1;
97        this.col = 1;
98        this.eof = input.length == 0;
99      } catch (IOException e) {
100       LOGGER.print();
101       throw e;
102     }
103   }
104 
105   public static boolean syntacticallyEqual(Lexer l1, Lexer l2) {
106     try {
107       var tok1 = l1.next();
108       var tok2 = l2.next();
109       while (tok1.type != TokenType.EOF && tok2.type != TokenType.EOF) {
110         if (tok1.type != tok2.type
111             || !tok1.stringValue.equals(tok2.stringValue)
112             || tok1.intValue != tok2.intValue
113             || tok1.doubleValue != tok2.doubleValue) {
114           return false;
115         }
116         tok1 = l1.next();
117         tok2 = l2.next();
118       }
119       return tok1.type == TokenType.EOF && tok2.type == TokenType.EOF;
120     } catch (CompilerException e) {
121       return false;
122     }
123   }
124 
125   private String getLexemeString() {
126     byte[] byteArray = new byte[lexeme.size()];
127     for (int i = 0; i < lexeme.size(); i++) {
128       byteArray[i] = lexeme.get(i).byteValue();
129     }
130     return new String(byteArray, StandardCharsets.UTF_8);
131   }
132 
133   public Token next() throws CompilerException {
134     startLine = line;
135     startCol = col;
136     lexeme = new ArrayList<>();
137     if (eof) {
138       LOGGER.print();
139       return createToken(TokenType.EOF);
140     }
141     byte c = consume();
142     switch (c) {
143       case ' ':
144       case '\t':
145       case '\r':
146       case '\n':
147         return next();
148       case '#':
149         return createToken(TokenType.HASH);
150       case ':':
151         return createToken(TokenType.COLON);
152       case '{':
153         return createToken(TokenType.LCURLY);
154       case '}':
155         return createToken(TokenType.RCURLY);
156       case '+':
157         if (peek('>')) {
158           consume();
159           return createToken(TokenType.INHERIT);
160         } else {
161           return createToken(TokenType.PLUS);
162         }
163       case '-':
164         if (peek('>')) {
165           consume();
166           return createToken(TokenType.OVERRIDE);
167         } else if (peek("->")) {
168           consume(2);
169           return createToken(TokenType.RARROW);
170         } else {
171           return createToken(TokenType.MINUS);
172         }
173       case '&':
174         return createToken(TokenType.ALL);
175       case '|':
176         return createToken(TokenType.ANY);
177       case '!':
178         if (peek('E')) {
179           consume();
180           return createToken(TokenType.NOTEXIST);
181         } else {
182           throw exception("Expected 'E'");
183         }
184       case '@':
185         return createToken(TokenType.AT);
186       case '[':
187         return createToken(TokenType.LBRACKET);
188       case ']':
189         return createToken(TokenType.RBRACKET);
190       case '(':
191         return createToken(TokenType.LPAREN);
192       case ')':
193         return createToken(TokenType.RPAREN);
194       case ',':
195         return createToken(TokenType.COMMA);
196       case '<':
197         if (peek("--")) {
198           consume(2);
199           return createToken(TokenType.LARROW);
200         } else if (peek('-')) {
201           consume();
202           return createToken(TokenType.REQUIRE);
203         } else {
204           throw exception("Expected '-' or '--'");
205         }
206       case '=':
207         return createToken(TokenType.ASSIGN);
208       case '\\':
209         if (peek('/')) {
210           consume();
211           return createToken(TokenType.UNION);
212         } else {
213           throw exception("Expected '/'");
214         }
215       case '/':
216         if (peek('\\')) {
217           consume();
218           return createToken(TokenType.INTERSECT);
219         } else if (peek('/')) {
220           while (!eof && !peek('\n') && !peek('\r')) {
221             consume();
222           }
223           createComment(TokenType.SINGLECOMMENT);
224           return next();
225         } else if (peek('*')) {
226           consume();
227           while (!peek("*/")) {
228             if (eof) {
229               throw exception(
230                   String.format(
231                       "Unterminated comment starting at %s",
232                       new Position(filename, startLine, startCol)));
233             }
234             consume();
235           }
236           consume(2);
237           createComment(TokenType.MULTICOMMENT);
238           return next();
239         } else {
240           return createToken(TokenType.DIVIDE);
241         }
242       case '.':
243         if (peek('.')) {
244           consume();
245           return createToken(TokenType.RANGE);
246         } else {
247           return createToken(TokenType.DOT);
248         }
249       case '*':
250         return createToken(TokenType.STAR);
251       case '^':
252         return createToken(TokenType.POWER);
253       case '"':
254         if (peek("\"\"")) {
255           consume(2);
256           while (peek(' ') || peek('\t')) {
257             consume();
258           }
259           if (peek('\r')) {
260             consume();
261             if (peek('\n')) {
262               consume();
263             }
264           } else if (peek('\n')) {
265             consume();
266           } else {
267             throw exception("Expected line terminator");
268           }
269           lexeme = new ArrayList<>();
270           while (!peek("\"\"\"")) {
271             if (eof) {
272               throw exception(
273                   String.format(
274                       "Unterminated multi-line string starting at %s",
275                       new Position(filename, startLine, startCol)));
276             } else if (peek('\r')) {
277               consume();
278               lexeme = lexeme.subList(0, lexeme.size() - 1);
279               lexeme.add((byte) '\n');
280               if (peek('\n')) {
281                 consume();
282                 lexeme = lexeme.subList(0, lexeme.size() - 1);
283               }
284             } else if (peek('\\')) {
285               consume();
286               if (input[index] < 32 || input[index] > 126) {
287                 throw exception(String.format("Invalid escape byte 0x%02X", input[index]));
288               }
289               consume();
290               var lexemeString = getLexemeString();
291               String escapeSequence = lexemeString.substring(lexemeString.length() - 2);
292               lexeme = lexeme.subList(0, lexeme.size() - 2);
293               if (!escapeSequences.containsKey(escapeSequence)) {
294                 throw exception(String.format("Invalid escape sequence '%s'", escapeSequence));
295               }
296               lexeme.add(escapeSequences.get(escapeSequence));
297             } else {
298               consume();
299             }
300           }
301           consume(3);
302           lexeme = lexeme.subList(0, lexeme.size() - 3);
303           return createToken(TokenType.MULTI_STRING);
304         }
305         while (!peek('"')) {
306           if (peek('\\')) {
307             consume();
308             if (eof || peek('\n')) {
309               throw exception(
310                   String.format(
311                       "Unterminated string starting at %s",
312                       new Position(filename, startLine, startCol)));
313             }
314             if (input[index] < 32 || input[index] > 126) {
315               throw exception(String.format("Invalid escape byte 0x%02X", input[index]));
316             }
317             consume();
318             var lexemeString = getLexemeString();
319             String escapeSequence = lexemeString.substring(lexemeString.length() - 2);
320             lexeme = lexeme.subList(0, lexeme.size() - 2);
321             if (!escapeSequences.containsKey(escapeSequence)) {
322               throw exception(String.format("Invalid escape sequence '%s'", escapeSequence));
323             }
324             lexeme.add(escapeSequences.get(escapeSequence));
325           } else if (eof || peek('\n')) {
326             throw exception(
327                 String.format(
328                     "Unterminated string starting at %s",
329                     new Position(filename, startLine, startCol)));
330           } else {
331             consume();
332           }
333         }
334         consume();
335         return createToken(TokenType.STRING);
336       default:
337         if (isAlpha(c)) {
338           while (isAlphaNumeric()) {
339             consume();
340           }
341           var lexemeString = getLexemeString();
342           if (keywords.containsKey(lexemeString)) {
343             return createToken(keywords.get(lexemeString));
344           } else {
345             return createToken(TokenType.ID);
346           }
347         } else if (isDigit(c)) {
348           while (isDigit()) {
349             consume();
350           }
351           if (peek("..") || !peek('.')) {
352             return createToken(TokenType.INT);
353           } else if (peek('.')) {
354             consume();
355             while (isDigit()) {
356               consume();
357             }
358             return createToken(TokenType.FLOAT);
359           }
360         }
361         if (c < 0) {
362           throw exception(String.format("Unexpected token 0x%02X", c));
363         } else {
364           throw exception(String.format("Unexpected token '%c'", (char) c));
365         }
366     }
367   }
368 
369   private void consume(int n) {
370     for (int i = 0; i < n; i++) {
371       consume();
372     }
373   }
374 
375   private byte consume() {
376     if (eof) {
377       throw new RuntimeException("Consuming past end-of-file");
378     }
379     if (input[index] == (byte) '\n') {
380       line++;
381       col = 1;
382     } else {
383       col++;
384     }
385     var c = input[index++];
386     lexeme.add(c);
387     if (index == input.length) {
388       eof = true;
389     }
390     return c;
391   }
392 
393   private boolean peek(String s) {
394     var bytes = s.getBytes();
395     if (input.length - index < bytes.length) {
396       return false;
397     }
398     for (int i = 0; i < bytes.length; i++) {
399       if (bytes[i] != input[index + i]) {
400         return false;
401       }
402     }
403     return true;
404   }
405 
406   private boolean peek(char c) {
407     return peek((byte) c);
408   }
409 
410   private boolean peek(byte c) {
411     if (eof) {
412       return false;
413     } else {
414       return c == input[index];
415     }
416   }
417 
418   private void createComment(TokenType type) {
419     var lexemeString = getLexemeString();
420     lexemeString = lexemeString.substring(2, lexemeString.length());
421     if (type == TokenType.MULTICOMMENT) {
422       lexemeString = lexemeString.substring(0, lexemeString.length() - 2);
423     }
424     comments.add(new Token(type, filename, startLine, startCol, lexemeString));
425   }
426 
427   private Token createRawToken(TokenType type) {
428     switch (type) {
429       case INT:
430         return new Token(type, filename, startLine, startCol, Integer.parseInt(getLexemeString()));
431       case FLOAT:
432         return new Token(
433             type, filename, startLine, startCol, Double.parseDouble(getLexemeString()));
434       case ID:
435         return new Token(type, filename, startLine, startCol, getLexemeString());
436       case STRING:
437         {
438           var lexemeString = getLexemeString();
439           return new Token(
440               type,
441               filename,
442               startLine,
443               startCol,
444               lexemeString.substring(1, lexemeString.length() - 1));
445         }
446       case MULTI_STRING:
447         {
448           var lexemeString = getLexemeString();
449           var lines = lexemeString.split("\\R");
450 
451           // Find minIndent
452           int minIndent = -1;
453           for (int i = 0; i < lines.length; i++) {
454             var line = lines[i];
455             if (!line.isBlank() || i + 1 == lines.length) {
456               var indent = 0;
457               for (int j = 0; j < line.length(); j++) {
458                 if (!Character.isWhitespace(line.charAt(j))) {
459                   break;
460                 }
461                 indent += 1;
462               }
463               if (minIndent == -1 || indent < minIndent) {
464                 minIndent = indent;
465               }
466             }
467           }
468 
469           // Strip lines
470           var newLines = new String[lines.length];
471           for (int i = 0; i < lines.length; i++) {
472             var line = lines[i];
473             if (line.isBlank()) {
474               newLines[i] = "";
475             } else {
476               if (minIndent != -1) {
477                 line = line.substring(minIndent);
478               }
479               newLines[i] = line.stripTrailing();
480             }
481           }
482 
483           return new Token(
484               TokenType.STRING, filename, startLine, startCol, String.join("\n", newLines));
485         }
486       default:
487         return new Token(type, filename, startLine, startCol);
488     }
489   }
490 
491   @SuppressWarnings("fallthrough")
492   private void readTrailingComments() throws CompilerException {
493     // Trailing comments are all comments followed on the same line as the previous
494     // token, including comments that follow previous trailing comments by exactly 1
495     // line.
496     startLine = line;
497     startCol = col;
498     lexeme = new ArrayList<>();
499     if (eof || peek('\n')) {
500       return;
501     }
502     byte c = consume();
503     switch (c) {
504       case ' ':
505       case '\t':
506         readTrailingComments();
507         return;
508       case '/':
509         if (peek('/')) {
510           while (!eof && !peek('\n') && !peek('\r')) {
511             consume();
512           }
513           createComment(TokenType.SINGLECOMMENT);
514           if (peek("\r\n")) {
515             consume(2);
516             readTrailingComments();
517           } else if (peek('\n')) {
518             consume();
519             readTrailingComments();
520           }
521           return;
522         } else if (peek('*')) {
523           consume();
524           while (!peek("*/")) {
525             if (eof) {
526               throw exception(
527                   String.format(
528                       "Unterminated comment starting at %s",
529                       new Position(filename, startLine, startCol)));
530             }
531             consume();
532           }
533           consume(2);
534           createComment(TokenType.MULTICOMMENT);
535           readTrailingComments();
536           return;
537         }
538         // Not a comment, we want to fall-through
539       default:
540         index--;
541         col--;
542         eof = false;
543         return;
544     }
545   }
546 
547   private Token createToken(TokenType type) throws CompilerException {
548     var token = createRawToken(type);
549     var preComments = List.copyOf(comments);
550     comments.clear();
551     readTrailingComments();
552     var postComments = List.copyOf(comments);
553     comments.clear();
554     return new Token(token, preComments, postComments);
555   }
556 
557   private CompilerException exception(String msg) {
558     Position pos = null;
559     if (eof) {
560       pos = new Position(filename, line, col);
561     } else {
562       pos = new Position(filename, startLine, startCol);
563     }
564     LOGGER.error(pos, msg);
565     LOGGER.print();
566     return new CompilerException("There were syntax errors");
567   }
568 
569   private boolean isDigit() {
570     if (eof) {
571       return false;
572     }
573     return isDigit(input[index]);
574   }
575 
576   private boolean isDigit(byte c) {
577     return '0' <= c && c <= '9';
578   }
579 
580   private boolean isAlpha(byte c) {
581     return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_';
582   }
583 
584   private boolean isAlphaNumeric() {
585     if (eof) {
586       return false;
587     }
588     return isAlphaNumeric(input[index]);
589   }
590 
591   private boolean isAlphaNumeric(byte c) {
592     return isDigit(c) || isAlpha(c);
593   }
594 }