Parser

2024-11-08 10:20:41 +00:00 · 2021-01-27 10:03:03 +01:00 · 2021-01-27 10:03:03 +01:00 · 8b61d32838
commit 8b61d32838
parent 8700a83390
12 changed files with 439 additions and 1 deletions
--- a/src/main/java/edu/kit/typicalc/model/parser/LambdaLexer.java
+++ b/src/main/java/edu/kit/typicalc/model/parser/LambdaLexer.java
@ -0,0 +1,114 @@
+package edu.kit.typicalc.model.parser;
+
+import edu.kit.typicalc.model.parser.Token.TokenType;
+import edu.kit.typicalc.util.Result;
+
+/**
+ * This class lexes a term given as String into tokens.
+ * Tokens are lexed one by one as requested by the parser.
+ */
+public class LambdaLexer {
+    /**
+     * The given term as a String
+     */
+    private final String term;
+    /**
+     * current position in the term
+     */
+    private int pos = 0;
+
+    /**
+     * Constructs a lexer that lexes the given term
+     * @param term the term to lex
+     */
+    public LambdaLexer(String term) {
+        this.term = term;
+    }
+
+    /**
+     * Advances the current char to the next char in the term.
+     */
+    private void advance() {
+        pos += 1;
+    }
+
+    /**
+     * Lexes and returns the next token.
+     * @return the next token
+     */
+    public Result<Token, ParseError> nextToken() {
+        while (pos < term.length() && Character.isWhitespace(term.charAt(pos))) {
+            advance();
+        }
+        if (pos >= term.length()) {
+            // term ended, return EOF
+            return new Result<>(new Token(TokenType.EOF, "", pos));
+        }
+        Token t;
+        char c = term.charAt(pos);
+        switch (c) {
+            // bunch of single-character tokens
+            case '.':
+                t = new Token(TokenType.DOT, ".", pos);
+                advance();
+                return new Result<>(t);
+            case '(':
+                t = new Token(TokenType.LP, "(", pos);
+                advance();
+                return new Result<>(t);
+            case ')':
+                t = new Token(TokenType.RP, ")", pos);
+                advance();
+                return new Result<>(t);
+            case '=':
+                t = new Token(TokenType.EQ, "=", pos);
+                advance();
+                return new Result<>(t);
+            case '\\':
+            case 'λ':
+                t = new Token(TokenType.LAMBDA, c+"", pos);
+                advance();
+                return new Result<>(t);
+            default:
+                if (Character.isLetter(c)) {
+                    // identifier
+                    StringBuilder sb = new StringBuilder();
+                    do {
+                        sb.append(term.charAt(pos));
+                        advance();
+                    } while (pos < term.length() && Character.isLetterOrDigit(term.charAt(pos)));
+                    String s = sb.toString();
+                    TokenType type;
+                    switch (s) {
+                        case "let":
+                            type = TokenType.LET;
+                            break;
+                        case "in":
+                            type = TokenType.IN;
+                            break;
+                        case "true":
+                            type = TokenType.TRUE;
+                            break;
+                        case "false":
+                            type = TokenType.FALSE;
+                            break;
+                        default:
+                            type = TokenType.VARIABLE;
+                            break;
+                    }
+                    return new Result<>(new Token(type, sb.toString(), pos));
+                } else if (Character.isDigit(c)) {
+                    // number literal
+                    StringBuilder sb = new StringBuilder();
+                    do {
+                        sb.append(term.charAt(pos));
+                        advance();
+                    } while (pos < term.length() && Character.isDigit(term.charAt(pos)));
+                    return new Result<>(new Token(TokenType.NUMBER, sb.toString(), pos));
+                } else {
+                    //throw new ParseException("Illegal character '" + term.charAt(pos) + "'");
+                    return new Result<>(null, ParseError.UNEXPECTED_CHARACTER);
+                }
+        }
+    }
+}
--- a/src/main/java/edu/kit/typicalc/model/parser/LambdaParser.java
+++ b/src/main/java/edu/kit/typicalc/model/parser/LambdaParser.java
@ -0,0 +1,171 @@
+package edu.kit.typicalc.model.parser;
+
+import edu.kit.typicalc.model.parser.Token.TokenType;
+import edu.kit.typicalc.model.term.AbsTerm;
+import edu.kit.typicalc.model.term.AppTerm;
+import edu.kit.typicalc.model.term.BooleanTerm;
+import edu.kit.typicalc.model.term.IntegerTerm;
+import edu.kit.typicalc.model.term.LambdaTerm;
+import edu.kit.typicalc.model.term.LetTerm;
+import edu.kit.typicalc.model.term.VarTerm;
+import edu.kit.typicalc.util.Result;
+
+import java.util.EnumSet;
+import java.util.Optional;
+import java.util.Set;
+
+public class LambdaParser {
+    /**
+     * lexer to translate a String into tokens
+     */
+    private final LambdaLexer lexer;
+    /**
+     * Next token to use while parsing.
+     * The following invariant holds:
+     * When calling a parseX method, token is the first token of X
+     * (as opposed to the last token of the previous construct).
+     */
+    private Token token;
+
+    private static final Set<TokenType> atomStartTokens
+            = EnumSet.of(TokenType.VARIABLE, TokenType.NUMBER, TokenType.TRUE,
+            TokenType.FALSE, TokenType.LP);
+
+    /**
+     * Constructs a parser with the specified String
+     * @param term String to parse
+     */
+    public LambdaParser(String term) {
+        this.lexer = new LambdaLexer(term);
+        nextToken();
+    }
+
+    /**
+     * Sets token to the next available token.
+     */
+    private Optional<ParseError> nextToken() {
+        Result<Token, ParseError> nextToken = lexer.nextToken();
+        if (nextToken.isError()) {
+            return Optional.of(nextToken.unwrapError());
+        }
+        token = nextToken.unwrap();
+        return Optional.empty();
+    }
+
+    /**
+     * Checks that the token type of current token matches the token type given as parameter.
+     * If successful, returns that token and advances to the next token.
+     * Returns false otherwise.
+     * @param type the token type to compare the current token type to
+     */
+    private boolean expect(TokenType type) {
+        TokenType current = token.getType();
+        nextToken(); // TODO: Fehlerbehandlung
+        return current == type;
+    }
+
+    /**
+     * Parses the String given in the constructor as a term.
+     * @return the term given by the String
+     */
+    public Result<LambdaTerm, ParseError> parse() {
+        Result<LambdaTerm, ParseError> t = parseTerm();
+        if (!expect(TokenType.EOF)) {
+            return new Result<>(null, ParseError.TOO_MANY_TOKENS);
+        }
+        return t;
+    }
+
+    /**
+     * Parses a term.
+     * @return the term, or an error
+     */
+    private Result<LambdaTerm, ParseError> parseTerm() {
+        switch (token.getType()) {
+            case LAMBDA:
+                Result<AbsTerm, ParseError> abs = parseAbstraction();
+                return new Result<>(abs.unwrap(), abs.unwrapError());
+            case LET:
+                Result<LetTerm, ParseError> let = parseLet();
+                return new Result<>(let.unwrap(), let.unwrapError());
+            default:
+                return parseApplication();
+        }
+    }
+
+    private Result<AbsTerm, ParseError> parseAbstraction() {
+        nextToken();
+        Result<VarTerm, ParseError> var = parseVar();
+        if (!expect(TokenType.DOT)) {
+            // TODO
+        }
+        Result<LambdaTerm, ParseError> body = parseTerm();
+        // TODO: Fehlerbehandlung
+        return new Result(new AbsTerm(var.unwrap(), body.unwrap()));
+    }
+
+    /**
+     * Parses an application or constructs of higher precedence.
+     * @return the term, or an error
+     */
+    private Result<LambdaTerm, ParseError> parseApplication() {
+        LambdaTerm left = parseAtom().unwrap(); // TODO: Fehlerbehandlung
+        while (atomStartTokens.contains(token.getType())) {
+            LambdaTerm atom = parseAtom().unwrap(); // TODO: Fehlerbehandlung
+            left = new AppTerm(left, atom);
+        }
+        return new Result<>(left);
+    }
+
+    private Result<LetTerm, ParseError> parseLet() {
+        // TODO: Fehlerbehandlung
+        expect(TokenType.LET);
+        VarTerm var = parseVar().unwrap();
+        expect(TokenType.EQ);
+        LambdaTerm def = parseTerm().unwrap();
+        expect(TokenType.IN);
+        LambdaTerm body = parseTerm().unwrap();
+        return new Result<>(new LetTerm(var, def, body));
+    }
+
+    /**
+     * Parses an atom (variable or number) or a parenthesised expression.
+     * @return the term
+     */
+    private Result<LambdaTerm, ParseError> parseAtom() {
+        switch (token.getType()) {
+            case VARIABLE:
+                Result<VarTerm, ParseError> var = parseVar();
+                return new Result<>(var.unwrap(), var.unwrapError());
+            case NUMBER:
+                String number = token.getText();
+                int n;
+                try {
+                    n = Integer.parseInt(number);
+                } catch (NumberFormatException e) {
+                    return new Result<>(null, ParseError.UNEXPECTED_CHARACTER);
+                }
+                nextToken();
+                return new Result<>(new IntegerTerm(n));
+            case TRUE:
+            case FALSE:
+                String boolText = token.getText();
+                boolean b = Boolean.parseBoolean(boolText);
+                nextToken();
+                return new Result<>(new BooleanTerm(b));
+            default:
+                expect(TokenType.LP);
+                Result<LambdaTerm, ParseError> term = parseTerm();
+                expect(TokenType.RP);
+                return term;
+        }
+    }
+
+    private Result<VarTerm, ParseError> parseVar() {
+        String s = token.getText();
+        if (!expect(TokenType.VARIABLE)) {
+            return new Result<>(null, ParseError.UNEXPECTED_TOKEN);
+        }
+        return new Result<>(new VarTerm(s));
+    }
+}
--- a/src/main/java/edu/kit/typicalc/model/parser/ParseError.java
+++ b/src/main/java/edu/kit/typicalc/model/parser/ParseError.java
@ -0,0 +1,7 @@
+package edu.kit.typicalc.model.parser;
+
+/**
+ * Kinds of errors that can occur while lexing or parsing a lambda term.
+ */
+public enum ParseError {
+    /** The parser found a token that is not allowed at the current position. */
+    UNEXPECTED_TOKEN,
+    /** A complete term was parsed, but the input contains further tokens. */
+    TOO_MANY_TOKENS,
+    /**
+     * The lexer found a character that cannot start any token.
+     * The parser also reports this for a number literal that cannot be converted to an int.
+     */
+    UNEXPECTED_CHARACTER
+}
--- a/src/main/java/edu/kit/typicalc/model/parser/Token.java
+++ b/src/main/java/edu/kit/typicalc/model/parser/Token.java
@ -0,0 +1,78 @@
+package edu.kit.typicalc.model.parser;
+
+/**
+ * A token of the Prolog language.
+ */
+public class Token {
+    /**
+     * Used to distinguish what kind of token we have.
+     * Most of them stand for exactly one character.
+     * VARIABLE and NUMBER have a regular expression associated with them.
+     * EOF is a special token to indicate that the end of file is reached.
+     */
+    enum TokenType {
+        LAMBDA, // λ or a backslash
+        VARIABLE, // [a-z][a-zA-Z0-9]* except "let" or "in" or constants
+        LET, // let
+        IN, // in
+        TRUE, // true
+        FALSE, // false
+        NUMBER, // [0-9]+
+        LP, // (
+        RP, // )
+        DOT, // .
+        EQ, // =
+        EOF // pseudo token if end of file is reached
+    }
+
+    /**
+     * token type of this Token
+     */
+    private final TokenType type;
+    /**
+     * the text of this token in the source code
+     */
+    private final String text;
+    private final int pos;
+
+    /**
+     * Constructs a token.
+     * @param type the token type
+     * @param text text of this token in the source code
+     * @param pos position this token begins
+     */
+    public Token(TokenType type, String text, int pos) {
+        this.type = type;
+        this.text = text;
+        this.pos = pos;
+    }
+
+    /**
+     * Returns the token type
+     * @return token type
+     */
+    public TokenType getType() {
+        return type;
+    }
+
+    /**
+     * Returns the text of this token in the source code
+     * @return text of this token in the source code
+     */
+    public String getText() {
+        return text;
+    }
+
+    /**
+     * Returns the position this token is in
+     * @return position this token is in
+     */
+    public int getPos() {
+        return pos;
+    }
+
+    @Override
+    public String toString() {
+        return type + "(\"" + text + "\")";
+    }
+}
--- a/src/main/java/edu/kit/typicalc/model/term/AbsTerm.java
+++ b/src/main/java/edu/kit/typicalc/model/term/AbsTerm.java
@ -0,0 +1,7 @@
+package edu.kit.typicalc.model.term;
+
+/**
+ * Term that represents a lambda abstraction: a bound variable and a body.
+ */
+public class AbsTerm extends LambdaTerm {
+	/** the variable bound by this abstraction */
+	private final VarTerm var;
+	/** the term the variable is bound in */
+	private final LambdaTerm body;
+
+	/**
+	 * Constructs an abstraction.
+	 * @param var the variable bound by the abstraction
+	 * @param body the body of the abstraction
+	 */
+	public AbsTerm(VarTerm var, LambdaTerm body) {
+		this.var = var;
+		this.body = body;
+	}
+
+	/**
+	 * Returns the bound variable
+	 * @return the variable passed to the constructor
+	 */
+	public VarTerm getVariable() {
+		return var;
+	}
+
+	/**
+	 * Returns the body of this abstraction
+	 * @return the body passed to the constructor
+	 */
+	public LambdaTerm getBody() {
+		return body;
+	}
+}
--- a/src/main/java/edu/kit/typicalc/model/term/AppTerm.java
+++ b/src/main/java/edu/kit/typicalc/model/term/AppTerm.java
@ -0,0 +1,6 @@
+package edu.kit.typicalc.model.term;
+
+/**
+ * Term that represents an application of one term to another.
+ */
+public class AppTerm extends LambdaTerm {
+	/** the term that is applied */
+	private final LambdaTerm left;
+	/** the term it is applied to */
+	private final LambdaTerm atom;
+
+	/**
+	 * Constructs an application.
+	 * @param left the left side of the application (the applied term)
+	 * @param atom the right side of the application (the argument)
+	 */
+	public AppTerm(LambdaTerm left, LambdaTerm atom) {
+		this.left = left;
+		this.atom = atom;
+	}
+
+	/**
+	 * Returns the left side of this application
+	 * @return the first term passed to the constructor
+	 */
+	public LambdaTerm getLeft() {
+		return left;
+	}
+
+	/**
+	 * Returns the right side of this application
+	 * @return the second term passed to the constructor
+	 */
+	public LambdaTerm getRight() {
+		return atom;
+	}
+}
--- a/src/main/java/edu/kit/typicalc/model/term/BooleanTerm.java
+++ b/src/main/java/edu/kit/typicalc/model/term/BooleanTerm.java
@ -0,0 +1,7 @@
+package edu.kit.typicalc.model.term;
+
+/**
+ * Constant term that holds a boolean value.
+ */
+public class BooleanTerm extends ConstTerm {
+	/** the value of this constant */
+	private final boolean value;
+
+	/**
+	 * Constructs a boolean constant.
+	 * @param value the value of the constant
+	 */
+	public BooleanTerm(boolean value) {
+		this.value = value;
+	}
+
+	/**
+	 * Returns the value of this constant
+	 * @return the value passed to the constructor
+	 */
+	public boolean getValue() {
+		return value;
+	}
+}
--- a/src/main/java/edu/kit/typicalc/model/term/ConstTerm.java
+++ b/src/main/java/edu/kit/typicalc/model/term/ConstTerm.java
@ -0,0 +1,4 @@
+package edu.kit.typicalc.model.term;
+
+/**
+ * Common superclass of the constant terms BooleanTerm and IntegerTerm.
+ * It does not declare any members of its own.
+ */
+public class ConstTerm extends LambdaTerm {
+}
--- a/src/main/java/edu/kit/typicalc/model/term/IntegerTerm.java
+++ b/src/main/java/edu/kit/typicalc/model/term/IntegerTerm.java
@ -0,0 +1,7 @@
+package edu.kit.typicalc.model.term;
+
+/**
+ * Constant term that holds an integer value.
+ */
+public class IntegerTerm extends ConstTerm {
+	/** the value of this constant */
+	private final int value;
+
+	/**
+	 * Constructs an integer constant.
+	 * @param value the value of the constant
+	 */
+	public IntegerTerm(int value) {
+		this.value = value;
+	}
+
+	/**
+	 * Returns the value of this constant
+	 * @return the value passed to the constructor
+	 */
+	public int getValue() {
+		return value;
+	}
+}
--- a/src/main/java/edu/kit/typicalc/model/term/LetTerm.java
+++ b/src/main/java/edu/kit/typicalc/model/term/LetTerm.java
@ -0,0 +1,6 @@
+package edu.kit.typicalc.model.term;
+
+/**
+ * Term that represents a let expression: let var = def in body.
+ */
+public class LetTerm extends LambdaTerm {
+	/** the variable that is defined */
+	private final VarTerm var;
+	/** the term the variable is defined as */
+	private final LambdaTerm def;
+	/** the term the definition is used in */
+	private final LambdaTerm body;
+
+	/**
+	 * Constructs a let term.
+	 * @param var the variable that is defined
+	 * @param def the definition of the variable
+	 * @param body the term following the "in" keyword
+	 */
+	public LetTerm(VarTerm var, LambdaTerm def, LambdaTerm body) {
+		this.var = var;
+		this.def = def;
+		this.body = body;
+	}
+
+	/**
+	 * Returns the defined variable
+	 * @return the variable passed to the constructor
+	 */
+	public VarTerm getVariable() {
+		return var;
+	}
+
+	/**
+	 * Returns the definition of the variable
+	 * @return the definition passed to the constructor
+	 */
+	public LambdaTerm getDefinition() {
+		return def;
+	}
+
+	/**
+	 * Returns the body of this let term
+	 * @return the body passed to the constructor
+	 */
+	public LambdaTerm getBody() {
+		return body;
+	}
+}
--- a/src/main/java/edu/kit/typicalc/model/term/VarTerm.java
+++ b/src/main/java/edu/kit/typicalc/model/term/VarTerm.java
@ -1,4 +1,7 @@
 package edu.kit.typicalc.model.term;

-public class VarTerm {
+/**
+ * Term that represents a variable, identified by its name.
+ */
+public class VarTerm extends LambdaTerm {
+	/** the name of this variable */
+	private final String name;
+
+	/**
+	 * Constructs a variable.
+	 * @param s the name of the variable
+	 */
+	public VarTerm(String s) {
+		super();
+		this.name = s;
+	}
+
+	/**
+	 * Returns the name of this variable
+	 * @return the name passed to the constructor
+	 */
+	public String getName() {
+		return name;
+	}
 }
--- a/src/main/java/edu/kit/typicalc/util/Result.java
+++ b/src/main/java/edu/kit/typicalc/util/Result.java
@ -0,0 +1,28 @@
+package edu.kit.typicalc.util;
+
+public class Result<T, E> {
+	private final T value;
+	private final E error;
+
+	public Result(T value) {
+		this.value = value;
+		this.error = null;
+	}
+
+	public Result(T value, E error) { // TODO: Java does not allow both constructors otherwise
+		this.value = value;
+		this.error = error;
+	}
+
+	public boolean isError() {
+		return error != null;
+	}
+
+	public T unwrap() {
+		return value;
+	}
+
+	public E unwrapError() {
+		return error;
+	}
+}