SyntacticAnalysis

Lexical analyzer that tokenizes code to see if it follows the rules of a certain EBNF grammar
git clone git://mattcarlson.org/repos/SyntacticAnalysis.git
Log | Files | Refs | README

commit 8dd34c51ee973e61cb22a762411d74cd75802d44
Author: Matthew Carlson <matt@mcarlson.xyz>
Date:   Wed, 22 Jun 2022 20:01:03 -0400

initial commit

Diffstat:
AREADME.md | 24++++++++++++++++++++++++
ASyntacticAnalysis.java | 268+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aprog.txt | 15+++++++++++++++
3 files changed, 307 insertions(+), 0 deletions(-)

diff --git a/README.md b/README.md @@ -0,0 +1,24 @@ +# SyntacticAnalyis + +Lexical analyzer that tokenizes code to see if it follows the rules of a certain EBNF grammar. + +The grammar in question: + +``` +<program> -> program begin <statement_list> end +<statement_list> -> <statement> {;<statement>} +<statement> -> <assignment_statement> | <if_statement> | <loop_statement> +<assignment_statement> -> <variable> = <expression> +<variable> -> identifier (An identifier is a string that begins with a letter followed by 0 or more letters and/or digits) +<expression> -> <term> {(+|-) <term>} +<term> -> <factor> {(* | /) <factor>} +<factor> -> identifier | int_constant | (<expr>) +<if_statement> -> if (<logic_expression>) then <statement> +<logic_expression> -> <variable> (< | >) <variable> (Assume logic expressions have only less than or greater than operators) +<loop_statement> -> loop (<logic_expression>) <statement> +``` +The analyzer will prompt you for the name of the file which contains the code. It will then tell you whether or not there is one or more syntax errors. + +`prog.txt` is an example of code that contains no syntax errors. + +I wrote this back in 2020 for COSC3127, a course at Algoma University. diff --git a/SyntacticAnalysis.java b/SyntacticAnalysis.java @@ -0,0 +1,268 @@ +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Scanner; +import java.util.StringTokenizer; + +public class SyntacticAnalysis { + public static class Parser { + private StringBuilder buffer; + private LexicalAnalyzer lexer = new LexicalAnalyzer(); + + private class LexicalAnalyzer { + // Only read tokens that *must* be read + public String read() { + if (buffer.length() == 0) return ""; + + // Lexeme will be from beginning of buffer (location of current token) to where whitespace is + int i = buffer.indexOf(" "); + String lexeme = buffer.substring(0, i); + + // Delete lexeme from buffer since it is now stored in variable + buffer.delete(0, i + 1); + + //Print what tokens are read for debugging purposes + //System.out.println(lexeme); + + return lexeme; + } + + /* Peek without deleting when choice is an option + * E.g., A factor token can be identifier, integer constant, or expression + */ + public String lookAhead() { + int i = buffer.indexOf(" "); + String lexeme = buffer.substring(0, i); + + return lexeme; + } + } + + public Parser(File file) { + Scanner sc = null; + + try { + sc = new Scanner(file); + buffer = new StringBuilder((int)file.length()); + + while(sc.hasNext()) { + String token = sc.next(); + + // Since token character sequence may contain characters such as ()+-*/=; + // We need to further break down token to separate special character and rest + // E.g., if(sum1<sum2) token will be further tokenized into if ( sum1 < sum2 ) + // This way spacing doesn't matter + StringTokenizer tokens = new StringTokenizer(token, "()+-*/=;", true); + + while (tokens.hasMoreTokens()) + buffer.append(tokens.nextToken() + " "); + } + } catch (FileNotFoundException fnfe) { + System.out.print("File not found."); + fnfe.printStackTrace(); + } finally { + sc.close(); + } + } + + // Returns program() since <program> -> program begin <statement_list> end is checked first + public boolean isValidCode() { + return program(); + } + + private boolean program() { + // Every program must have program followed by begin + if (!lexer.read().toLowerCase().equals("program")) return false; + if (!lexer.read().toLowerCase().equals("begin")) return false; + + if (!statementList()) return false; + + // Every program must finish with end keyword + if (!lexer.read().toLowerCase().equals("end")) return false; + + return true; + } + + private boolean statementList() { + // At least one statement must be present in a statement list + if (!statement()) return false; + + // Semicolon must be present if additional statements follow + while (lexer.lookAhead().equals(";")) { + lexer.read(); + + if (!statement()) return false; + } + + return true; + } + + private boolean statement() { + // Check for assignment statement + // Not using variable() method because we do not want to delete token + if (lexer.lookAhead().matches("[a-zA-Z][a-zA-Z0-9]*") && !isReservedWord(lexer.lookAhead())) { + if (!assignmentStatement()) return false; + } else if (lexer.lookAhead().equals("if")) { // Check for if statement + if (!ifStatement()) return false; + } else if (lexer.lookAhead().equals("loop")) { // Check for loop statement + if (!loopStatement()) return false; + } else { + return false; // Can't be a statement if not any of the above + } + + return true; + } + + // <assignment_statement> -> <var> = <expr> + private boolean assignmentStatement() { + if (!variable()) return false; + if (!lexer.read().equals("=")) return false; + if (!expression()) return false; + + return true; + } + + // <if_statement> -> if (<logic_expr>) then <stmnt> + private boolean ifStatement() { + if (!lexer.read().equals("if")) return false; + if (!lexer.read().equals("(")) return false; + if (!logicExpression()) return false; + if (!lexer.read().equals(")")) return false; + if (!lexer.read().equals("then")) return false; + if (!statement()) return false; + + return true; + } + + // <loop_statement> -> loop (<logic_expr>) <statement> + private boolean loopStatement() { + if (!lexer.read().equals("loop")) return false; + if (!lexer.read().equals("(")) return false; + if (!logicExpression()) return false; + if (!lexer.read().equals(")")) return false; + if (!statement()) return false; + + return true; + } + + // <expr> -> <term> {(+|-) <term>} + private boolean expression() { + if (!term()) return false; + + // Since + is a regex key symbol it must be escaped + while (lexer.lookAhead().matches("\\+|-")) { + lexer.read(); + + if (!term()) return false; + } + + return true; + } + + // <logic_expr> -> <var> {(<|>) <var>} + private boolean logicExpression() { + if (!variable()) return false; + if (!(lexer.lookAhead().matches("<|>"))) return false; + + lexer.read(); + + if (!variable()) return false; + + return true; + } + + // <term> -> <factor> {(*|/) <factor>} + private boolean term() { + if (!factor()) return false; + + // * is also a key symbol in regex + while (lexer.lookAhead().matches("\\*|/")) { + lexer.read(); + + if (!factor()) return false; + } + + return true; + } + + // <factor> -> <identifier> | <int_const> | <expr> + private boolean factor() { + if (!identifier()) { + if (!integerConstant()) { + if (!lexer.lookAhead().equals("(")) return false; + + lexer.read(); + + if (!expression()) return false; + + if (!lexer.lookAhead().equals(")")) return false; + + lexer.read(); + } + + return true; + } + + return true; + } + + // <var> -> <identifier> + private boolean variable() { + if (!identifier()) return false; + + return true; + } + + private boolean identifier() { + // Regular expression to determine valid identifier + // [a-zA-Z] means starts with any letter + // [a-zA-Z0-9]* means any amount of alphanumeric characters (0 - infinity) + if (!lexer.lookAhead().matches("[a-zA-Z][a-zA-Z0-9]*") && !isReservedWord(lexer.lookAhead())) return false; + + lexer.read(); + + return true; + } + + // A keyword is a word that cannot be used as an identifier + // This method is just as a safety measure and should be a feature of code parsers + private boolean isReservedWord(String identifier) { + // Regex will recognize these as valid identifiers when they really aren't + final String[] reservedWords = {"program", "begin", "end", "if", "loop"}; + + for (String rw: reservedWords) { + // Identifier is invalid if it is any of the above + if (identifier.toLowerCase().equals(rw)) return true; + } + + return false; + } + + // Maintaining consistency with methods for most symbols -- doesn't really need own method + private boolean integerConstant() { + // A regex to detect if string has only digits (i.e. is integer constant) + // \\d is escape sequence for digit and .* means any amount + if (!lexer.lookAhead().matches(".*\\d.*")) return false; + + lexer.read(); + + return true; + } + } + + // Test method + public static void main(String[] args) { + Scanner sc = new Scanner(System.in); + + System.out.print("\nEnter file name: "); + String name = sc.next(); + + File file = new File(name); + + Parser parser = new Parser(file); + + if (parser.isValidCode()) + System.out.println("\nThere are no syntax errors in the program.\n"); + else + System.out.println("\nThe program contains one or more syntax errors.\n"); + } +} diff --git a/prog.txt b/prog.txt @@ -0,0 +1,15 @@ +program +begin + +sum1 = var1 + var2; +sum2 = var3 + var2 * 90; +sum3 = (var2 + var1) * var3; + +if (sum1 < sum2) then + if (var1 > var2) then + var4 = sum2 - sum1; + +loop (var1 < var2) + var5 = var4/45 + +end