initial commit - SyntacticAnalysis - Lexical analyzer that tokenizes code to see if it follows the rules of a certain EBNF grammar

commit 8dd34c51ee973e61cb22a762411d74cd75802d44
Author: Matthew Carlson <matt@mcarlson.xyz>
Date:   Wed, 22 Jun 2022 20:01:03 -0400

initial commit

Diffstat:
A README.md  | 24 ++++++++++++++++++++++++
A SyntacticAnalysis.java  | 268 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A prog.txt  | 15 +++++++++++++++

3 files changed, 307 insertions(+), 0 deletions(-)
diff --git a/README.md b/README.md
@@ -0,0 +1,24 @@
+# SyntacticAnalyis
+
+Lexical analyzer that tokenizes code to see if it follows the rules of a certain EBNF grammar.
+
+The grammar in question:
+
+```
+<program> -> program begin <statement_list> end
+<statement_list> -> <statement> {;<statement>}
+<statement> -> <assignment_statement> | <if_statement> | <loop_statement>
+<assignment_statement> -> <variable> = <expression>
+<variable> -> identifier (An identifier is a string that begins with a letter followed by 0 or more letters and/or digits)
+<expression> -> <term> {(+|-) <term>}
+<term> -> <factor> {(* | /) <factor>}
+<factor> -> identifier | int_constant | (<expr>)
+<if_statement> -> if (<logic_expression>) then <statement>
+<logic_expression> -> <variable> (< | >) <variable> (Assume logic expressions have only less than or greater than operators)
+<loop_statement> -> loop (<logic_expression>) <statement>
+```
+The analyzer will prompt you for the name of the file which contains the code. It will then tell you whether or not there is one or more syntax errors.
+
+`prog.txt` is an example of code that contains no syntax errors.
+
+I wrote this back in 2020 for COSC3127, a course at Algoma University.
diff --git a/SyntacticAnalysis.java b/SyntacticAnalysis.java
@@ -0,0 +1,268 @@
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.Scanner;
+import java.util.StringTokenizer;
+
+public class SyntacticAnalysis {
+	public static class Parser {
+		private StringBuilder buffer;
+		private LexicalAnalyzer lexer = new LexicalAnalyzer();
+
+		private class LexicalAnalyzer {
+			// Only read tokens that *must* be read
+			public String read() {
+				if (buffer.length() == 0) return "";
+
+				// Lexeme will be from beginning of buffer (location of current token) to where whitespace is
+				int i = buffer.indexOf(" ");
+				String lexeme = buffer.substring(0, i);
+
+				// Delete lexeme from buffer since it is now stored in variable
+				buffer.delete(0, i + 1);
+
+				//Print what tokens are read for debugging purposes
+				//System.out.println(lexeme);
+
+				return lexeme;
+			}
+
+			/* Peek without deleting when choice is an option
+			 * E.g., A factor token can be identifier, integer constant, or expression
+             */
+			public String lookAhead() {
+				int i = buffer.indexOf(" ");
+				String lexeme = buffer.substring(0, i);
+
+				return lexeme;
+			}
+		}
+
+		public Parser(File file) {
+			Scanner sc = null;
+
+			try {
+				sc = new Scanner(file);
+				buffer = new StringBuilder((int)file.length());
+
+				while(sc.hasNext()) {
+					String token = sc.next();
+
+					// Since token character sequence may contain characters such as ()+-*/=;
+					// We need to further break down token to separate special character and rest
+                    // E.g., if(sum1<sum2) token will be further tokenized into if ( sum1 < sum2 )
+					// This way spacing doesn't matter
+					StringTokenizer tokens = new StringTokenizer(token, "()+-*/=;", true);
+
+					while (tokens.hasMoreTokens())
+						buffer.append(tokens.nextToken() + " ");
+				}
+			} catch (FileNotFoundException fnfe) {
+				System.out.print("File not found.");
+				fnfe.printStackTrace();
+			} finally {
+				sc.close();
+			}
+		}
+
+		// Returns program() since <program> -> program begin <statement_list> end is checked first
+		public boolean isValidCode() {
+			return program();
+		}
+
+		private boolean program() {
+			// Every program must have program followed by begin
+			if (!lexer.read().toLowerCase().equals("program")) return false;
+			if (!lexer.read().toLowerCase().equals("begin"))   return false;
+
+			if (!statementList()) return false;
+
+			// Every program must finish with end keyword
+			if (!lexer.read().toLowerCase().equals("end")) return false;
+
+			return true;
+		}
+
+		private boolean statementList() {
+			// At least one statement must be present in a statement list
+			if (!statement()) return false;
+
+			// Semicolon must be present if additional statements follow
+			while (lexer.lookAhead().equals(";")) {
+				lexer.read();
+
+				if (!statement()) return false;
+			}
+
+            return true;
+		}
+
+		private boolean statement() {
+			// Check for assignment statement
+			// Not using variable() method because we do not want to delete token
+			if (lexer.lookAhead().matches("[a-zA-Z][a-zA-Z0-9]*") && !isReservedWord(lexer.lookAhead())) {
+				if (!assignmentStatement()) return false;
+			} else if (lexer.lookAhead().equals("if")) {   // Check for if statement
+				if (!ifStatement()) return false;
+			} else if (lexer.lookAhead().equals("loop")) { // Check for loop statement
+				if (!loopStatement()) return false;
+			} else {
+				return false;                              // Can't be a statement if not any of the above
+			}
+
+			return true;
+		}
+
+		// <assignment_statement> -> <var> = <expr>
+		private boolean assignmentStatement() {
+			if (!variable()) return false;
+			if (!lexer.read().equals("=")) return false;
+			if (!expression()) return false;
+
+            return true;
+		}
+
+		// <if_statement> -> if (<logic_expr>) then <stmnt>
+		private boolean ifStatement() {
+			if (!lexer.read().equals("if"))   return false;
+			if (!lexer.read().equals("("))    return false;
+			if (!logicExpression())           return false;
+			if (!lexer.read().equals(")"))    return false;
+			if (!lexer.read().equals("then")) return false;
+			if (!statement())                 return false;
+
+            return true;
+		}
+
+		// <loop_statement> -> loop (<logic_expr>) <statement>
+		private boolean loopStatement() {
+			if (!lexer.read().equals("loop")) return false;
+			if (!lexer.read().equals("("))    return false;
+			if (!logicExpression())           return false;
+			if (!lexer.read().equals(")"))    return false;
+			if (!statement())                 return false;
+
+			return true;
+		}
+
+		// <expr> -> <term> {(+|-) <term>}
+		private boolean expression() {
+			if (!term()) return false;
+
+			// Since + is a regex key symbol it must be escaped
+			while (lexer.lookAhead().matches("\\+|-")) {
+				lexer.read();
+
+                if (!term()) return false;
+			}
+
+			return true;
+		}
+
+		// <logic_expr> -> <var> {(<|>) <var>}
+		private boolean logicExpression() {
+			if (!variable()) return false;
+			if (!(lexer.lookAhead().matches("<|>"))) return false;
+
+            lexer.read();
+
+			if (!variable()) return false;
+
+			return true;
+		}
+
+		// <term> -> <factor> {(*|/) <factor>}
+		private boolean term() {
+			if (!factor()) return false;
+
+			// * is also a key symbol in regex
+			while (lexer.lookAhead().matches("\\*|/")) {
+				lexer.read();
+
+				if (!factor()) return false;
+			}
+
+			return true;
+		}
+
+		// <factor> -> <identifier> | <int_const> | <expr>
+		private boolean factor() {
+			if (!identifier()) {
+				if (!integerConstant()) {
+					if (!lexer.lookAhead().equals("(")) return false;
+
+					lexer.read();
+
+					if (!expression()) return false;
+
+					if (!lexer.lookAhead().equals(")")) return false;
+
+					lexer.read();
+				}
+
+				return true;
+			}
+
+			return true;
+		}
+
+		// <var> -> <identifier>
+		private boolean variable() {
+			if (!identifier()) return false;
+
+			return true;
+		}
+
+		private boolean identifier() {
+			// Regular expression to determine valid identifier
+			// [a-zA-Z] means starts with any letter
+			// [a-zA-Z0-9]* means any amount of alphanumeric characters (0 - infinity)
+			if (!lexer.lookAhead().matches("[a-zA-Z][a-zA-Z0-9]*") && !isReservedWord(lexer.lookAhead())) return false;
+
+			lexer.read();
+
+			return true;
+		}
+
+		// A keyword is a word that cannot be used as an identifier
+		// This method is just as a safety measure and should be a feature of code parsers
+		private boolean isReservedWord(String identifier) {
+			// Regex will recognize these as valid identifiers when they really aren't
+			final String[] reservedWords = {"program", "begin", "end", "if", "loop"};
+
+			for (String rw: reservedWords) {
+				// Identifier is invalid if it is any of the above
+				if (identifier.toLowerCase().equals(rw)) return true;
+			}
+
+			return false;
+		}
+
+		// Maintaining consistency with methods for most symbols -- doesn't really need own method
+		private boolean integerConstant() {
+			// A regex to detect if string has only digits (i.e. is integer constant)
+			// \\d is escape sequence for digit and .* means any amount
+			if (!lexer.lookAhead().matches(".*\\d.*")) return false;
+
+			lexer.read();
+
+			return true;
+		}
+	}
+
+	// Test method
+	public static void main(String[] args) {
+		Scanner sc = new Scanner(System.in);
+
+		System.out.print("\nEnter file name: ");
+		String name = sc.next();
+
+		File file = new File(name);
+
+		Parser parser = new Parser(file);
+
+		if (parser.isValidCode())
+			System.out.println("\nThere are no syntax errors in the program.\n");
+		else
+			System.out.println("\nThe program contains one or more syntax errors.\n");
+	}
+}
diff --git a/prog.txt b/prog.txt
@@ -0,0 +1,15 @@
+program
+begin
+
+sum1 = var1 + var2;
+sum2 = var3 + var2 * 90;
+sum3 = (var2 + var1) * var3;
+
+if (sum1 < sum2) then
+    if (var1 > var2) then
+        var4 = sum2 - sum1;
+
+loop (var1 < var2)
+    var5 = var4/45
+
+end

	SyntacticAnalysis Lexical analyzer that tokenizes code to see if it follows the rules of a certain EBNF grammar
	git clone git://mattcarlson.org/repos/SyntacticAnalysis.git
	Log \| Files \| Refs \| README

A	README.md	\|	24	++++++++++++++++++++++++
A	SyntacticAnalysis.java	\|	268	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	prog.txt	\|	15	+++++++++++++++