SyntacticAnalysis.java - SyntacticAnalysis - Lexical analyzer that tokenizes code to see if it follows the rules of a certain EBNF grammar

SyntacticAnalysis.java (7486B)
      1 import java.io.File;
      2 import java.io.FileNotFoundException;
      3 import java.util.Scanner;
      4 import java.util.StringTokenizer;
      5 
      6 public class SyntacticAnalysis {
      7 	public static class Parser {
      8 		private StringBuilder buffer;
      9 		private LexicalAnalyzer lexer = new LexicalAnalyzer();
     10 
     11 		private class LexicalAnalyzer {
     12 			// Only read tokens that *must* be read
     13 			public String read() {
     14 				if (buffer.length() == 0) return "";
     15 
     16 				// Lexeme will be from beginning of buffer (location of current token) to where whitespace is
     17 				int i = buffer.indexOf(" ");
     18 				String lexeme = buffer.substring(0, i);
     19 
     20 				// Delete lexeme from buffer since it is now stored in variable
     21 				buffer.delete(0, i + 1);
     22 
     23 				//Print what tokens are read for debugging purposes
     24 				//System.out.println(lexeme);
     25 
     26 				return lexeme;
     27 			}
     28 
     29 			/* Peek without deleting when choice is an option
     30 			 * E.g., A factor token can be identifier, integer constant, or expression
     31              */
     32 			public String lookAhead() {
     33 				int i = buffer.indexOf(" ");
     34 				String lexeme = buffer.substring(0, i);
     35 
     36 				return lexeme;
     37 			}
     38 		}
     39 
     40 		public Parser(File file) {
     41 			Scanner sc = null;
     42 
     43 			try {
     44 				sc = new Scanner(file);
     45 				buffer = new StringBuilder((int)file.length());
     46 
     47 				while(sc.hasNext()) {
     48 					String token = sc.next();
     49 
     50 					// Since token character sequence may contain characters such as ()+-*/=;
     51 					// We need to further break down token to separate special character and rest
     52                     // E.g., if(sum1<sum2) token will be further tokenized into if ( sum1 < sum2 )
     53 					// This way spacing doesn't matter
     54 					StringTokenizer tokens = new StringTokenizer(token, "()+-*/=;", true);
     55 
     56 					while (tokens.hasMoreTokens())
     57 						buffer.append(tokens.nextToken() + " ");
     58 				}
     59 			} catch (FileNotFoundException fnfe) {
     60 				System.out.print("File not found.");
     61 				fnfe.printStackTrace();
     62 			} finally {
     63 				sc.close();
     64 			}
     65 		}
     66 
     67 		// Returns program() since <program> -> program begin <statement_list> end is checked first
     68 		public boolean isValidCode() {
     69 			return program();
     70 		}
     71 
     72 		private boolean program() {
     73 			// Every program must have program followed by begin
     74 			if (!lexer.read().toLowerCase().equals("program")) return false;
     75 			if (!lexer.read().toLowerCase().equals("begin"))   return false;
     76 
     77 			if (!statementList()) return false;
     78 
     79 			// Every program must finish with end keyword
     80 			if (!lexer.read().toLowerCase().equals("end")) return false;
     81 
     82 			return true;
     83 		}
     84 
     85 		private boolean statementList() {
     86 			// At least one statement must be present in a statement list
     87 			if (!statement()) return false;
     88 
     89 			// Semicolon must be present if additional statements follow
     90 			while (lexer.lookAhead().equals(";")) {
     91 				lexer.read();
     92 
     93 				if (!statement()) return false;
     94 			}
     95 
     96             return true;
     97 		}
     98 
     99 		private boolean statement() {
    100 			// Check for assignment statement
    101 			// Not using variable() method because we do not want to delete token
    102 			if (lexer.lookAhead().matches("[a-zA-Z][a-zA-Z0-9]*") && !isReservedWord(lexer.lookAhead())) {
    103 				if (!assignmentStatement()) return false;
    104 			} else if (lexer.lookAhead().equals("if")) {   // Check for if statement
    105 				if (!ifStatement()) return false;
    106 			} else if (lexer.lookAhead().equals("loop")) { // Check for loop statement
    107 				if (!loopStatement()) return false;
    108 			} else {
    109 				return false;                              // Can't be a statement if not any of the above
    110 			}
    111 
    112 			return true;
    113 		}
    114 
    115 		// <assignment_statement> -> <var> = <expr>
    116 		private boolean assignmentStatement() {
    117 			if (!variable()) return false;
    118 			if (!lexer.read().equals("=")) return false;
    119 			if (!expression()) return false;
    120 
    121             return true;
    122 		}
    123 
    124 		// <if_statement> -> if (<logic_expr>) then <stmnt>
    125 		private boolean ifStatement() {
    126 			if (!lexer.read().equals("if"))   return false;
    127 			if (!lexer.read().equals("("))    return false;
    128 			if (!logicExpression())           return false;
    129 			if (!lexer.read().equals(")"))    return false;
    130 			if (!lexer.read().equals("then")) return false;
    131 			if (!statement())                 return false;
    132 
    133             return true;
    134 		}
    135 
    136 		// <loop_statement> -> loop (<logic_expr>) <statement>
    137 		private boolean loopStatement() {
    138 			if (!lexer.read().equals("loop")) return false;
    139 			if (!lexer.read().equals("("))    return false;
    140 			if (!logicExpression())           return false;
    141 			if (!lexer.read().equals(")"))    return false;
    142 			if (!statement())                 return false;
    143 
    144 			return true;
    145 		}
    146 
    147 		// <expr> -> <term> {(+|-) <term>}
    148 		private boolean expression() {
    149 			if (!term()) return false;
    150 
    151 			// Since + is a regex key symbol it must be escaped
    152 			while (lexer.lookAhead().matches("\\+|-")) {
    153 				lexer.read();
    154 
    155                 if (!term()) return false;
    156 			}
    157 
    158 			return true;
    159 		}
    160 
    161 		// <logic_expr> -> <var> {(<|>) <var>}
    162 		private boolean logicExpression() {
    163 			if (!variable()) return false;
    164 			if (!(lexer.lookAhead().matches("<|>"))) return false;
    165 
    166             lexer.read();
    167 
    168 			if (!variable()) return false;
    169 
    170 			return true;
    171 		}
    172 
    173 		// <term> -> <factor> {(*|/) <factor>}
    174 		private boolean term() {
    175 			if (!factor()) return false;
    176 
    177 			// * is also a key symbol in regex
    178 			while (lexer.lookAhead().matches("\\*|/")) {
    179 				lexer.read();
    180 
    181 				if (!factor()) return false;
    182 			}
    183 
    184 			return true;
    185 		}
    186 
    187 		// <factor> -> <identifier> | <int_const> | <expr>
    188 		private boolean factor() {
    189 			if (!identifier()) {
    190 				if (!integerConstant()) {
    191 					if (!lexer.lookAhead().equals("(")) return false;
    192 
    193 					lexer.read();
    194 
    195 					if (!expression()) return false;
    196 
    197 					if (!lexer.lookAhead().equals(")")) return false;
    198 
    199 					lexer.read();
    200 				}
    201 
    202 				return true;
    203 			}
    204 
    205 			return true;
    206 		}
    207 
    208 		// <var> -> <identifier>
    209 		private boolean variable() {
    210 			if (!identifier()) return false;
    211 
    212 			return true;
    213 		}
    214 
    215 		private boolean identifier() {
    216 			// Regular expression to determine valid identifier
    217 			// [a-zA-Z] means starts with any letter
    218 			// [a-zA-Z0-9]* means any amount of alphanumeric characters (0 - infinity)
    219 			if (!lexer.lookAhead().matches("[a-zA-Z][a-zA-Z0-9]*") && !isReservedWord(lexer.lookAhead())) return false;
    220 
    221 			lexer.read();
    222 
    223 			return true;
    224 		}
    225 
    226 		// A keyword is a word that cannot be used as an identifier
    227 		// This method is just as a safety measure and should be a feature of code parsers
    228 		private boolean isReservedWord(String identifier) {
    229 			// Regex will recognize these as valid identifiers when they really aren't
    230 			final String[] reservedWords = {"program", "begin", "end", "if", "loop"};
    231 
    232 			for (String rw: reservedWords) {
    233 				// Identifier is invalid if it is any of the above
    234 				if (identifier.toLowerCase().equals(rw)) return true;
    235 			}
    236 
    237 			return false;
    238 		}
    239 
    240 		// Maintaining consistency with methods for most symbols -- doesn't really need own method
    241 		private boolean integerConstant() {
    242 			// A regex to detect if string has only digits (i.e. is integer constant)
    243 			// \\d is escape sequence for digit and .* means any amount
    244 			if (!lexer.lookAhead().matches(".*\\d.*")) return false;
    245 
    246 			lexer.read();
    247 
    248 			return true;
    249 		}
    250 	}
    251 
    252 	// Test method
    253 	public static void main(String[] args) {
    254 		Scanner sc = new Scanner(System.in);
    255 
    256 		System.out.print("\nEnter file name: ");
    257 		String name = sc.next();
    258 
    259 		File file = new File(name);
    260 
    261 		Parser parser = new Parser(file);
    262 
    263 		if (parser.isValidCode())
    264 			System.out.println("\nThere are no syntax errors in the program.\n");
    265 		else
    266 			System.out.println("\nThe program contains one or more syntax errors.\n");
    267 	}
    268 }
	SyntacticAnalysis Lexical analyzer that tokenizes code to see if it follows the rules of a certain EBNF grammar
	git clone git://mattcarlson.org/repos/SyntacticAnalysis.git
	Log \| Files \| Refs \| README