There were some outstandingly good solutions handed in, but several people had clearly not found time or put in the effort to complete the assignment. This showed up very clearly in the little prac test, which revealed, rather sadly and surprisingly, that there seemed to be some students who did not have the first clue about how to write a simple recursive descent parser for a tiny system of productions. Please study the solutions to the prac test carefully - they are all on the WWW pages for the course under the heading of "Prac Tests".
Another point that I noticed was that many people, both in the test and in the prac, were driving their parsers from the back, so to speak. Given a construction like
A = { "start" Something } "follow" .
it is far better to produce a parser routine like
while (sym.kind == startSym) { getSym(); something(); } accept(followSym);
than one like
while (sym.kind != followSym) { getSym(); something(); } accept(followSym);
for the simple reason that there might be something wrong with something!
Complete source code for three possible solutions to the prac are available on the WWW pages in files PRAC23A.ZIP (Java) or PRAC23AC.ZIP (C#).
Here is the important part of the code for the scanner. Most people got a lot of this correct - except perhaps for the comment handling (which you had been told to read up about in the textbook, so that failing to get that right was just silly). There was some funny clumsy code dealing with the "escaped" tokens - please have a good look at what I suggest.
// +++++++++++++++++++++++ token kinds enumeration +++++++++++++++++++++++++ static final int noSym = 0, EOFSym = 1, SemicolonSym = 2, BarSym = 3, LParenSym = 4, LBrackSym = 5, RParenSym = 6, RBrackSym = 7, ConcatSym = 8, OptionSym = 9, RepeatSym = 10, PlusSym = 11, HyphenSym = 12, AtomSym = 13, EscapedSym = 14; // +++++++++++++++++++++++++++++++ Scanner ++++++++++++++++++++++++++++++++++ // Declaring sym as a global variable is done for expediency - global variables // are not always a good thing static Token sym; static void getSym() { // Scans for next sym from input while (ch > EOF && ch <= ' ') getChar(); // comment handlers have to be carefully written - we must check for unclosed // comments, as well as making sure that we really do find a token. Note the // recursive call to getSym if (ch == '\\') { // skip comment do getChar(); while (ch != '\\' && ch != EOF); if (ch != EOF) { // try again getChar(); getSym(); return; } else { // give up sym = new Token(EOFSym, "EOF"); // perhaps abort("comment was not terminated"); return; } } StringBuffer symLex = new StringBuffer(); int symKind = noSym; symLex.append(ch); switch (ch) { case EOF : symKind = EOFSym; symLex = new StringBuffer("EOF"); break; // no need to getChar here, of course case ';' : symKind = SemicolonSym; getChar(); break; case '|' : symKind = BarSym; getChar(); break; case '(' : symKind = LParenSym; getChar(); break; case '[' : symKind = LBrackSym; getChar(); break; case ')' : symKind = RParenSym; getChar(); break; case ']' : symKind = RBrackSym; getChar(); break; case '.' : symKind = ConcatSym; getChar(); break; case '?' : symKind = OptionSym; getChar(); break; case '*' : symKind = RepeatSym; getChar(); break; case '+' : symKind = PlusSym; getChar(); break; case '-' : symKind = HyphenSym; getChar(); break; // escaped characters need special treatment - we must check for matching // quote marks case '"' : case '\'' : char quote = ch; getChar(); symLex.append(ch); if (ch < ' ') break; // escaped exclude control chars getChar(); symLex.append(ch); if (ch == quote) { symKind = EscapedSym; // else noSym already GetChar(); } break; // unusually, perhaps, the default option mops up all other possibilities as // valid. Very often default handlers must mop up errors default : symKind = AtomSym; getChar(); break; } sym = new Token(symKind, symLex.toString()); }
and here is code for a simple parser with brutal error handling:
// +++++++++++++++++++++++++++++++ Parser +++++++++++++++++++++++++++++++++++ static void accept(int wantedSym, String errorMessage) { // Checks that lookahead token is wantedSym if (sym.kind == wantedSym) getSym(); else abort(errorMessage); } static SymSet FirstExpression = new SymSet(new int[] {AtomSym, EscapedSym, LBrackSym, LParenSym}), FirstFactor = new SymSet(new int[] {AtomSym, EscapedSym, LBrackSym, LParenSym}), FirstAtom = new SymSet(new int[] {AtomSym, EscapedSym}), FirstOneRange = new SymSet(new int[] {AtomSym, EscapedSym}), Iterators = new SymSet(new int[] {RepeatSym, OptionSym, PlusSym}); static void RE () { // RE == { Expression ";" } EOF . // Note that we drive the loop on a FirstExpression check, and not on // while (sym != EOFSym) ... // Parser routines should normally be driven positively on FIRST sets, not negatively // on the use of FOLLOW sets while (FirstExpression.contains(sym.kind)) { Expression(); accept(SemicolonSym, "; expected"); } accept(EOFSym, "EOF expected"); } static void Expression () { // Expression = Term { "|" Term } . Term(); while (sym.kind == BarSym) { accept(BarSym, "| expected"); Term(); } } static void Term () { // Term = Factor { [ "." ] Factor } . Factor(); while (FirstFactor.contains(sym.kind) || sym.kind == ConcatSym) { if (sym.kind == ConcatSym) getSym(); Factor(); } } static void Factor () { // Factor == Element [ "*" | "?" | "+" ] . Element(); if (Iterators.contains(sym.kind)) getSym(); } static void Element () { // Element == Atom | Range | "(" Expression ")" . if (FirstAtom.contains(sym.kind)) Atom(); else if (sym.kind == LBrackSym) Range(); else if (sym.kind == LParenSym) { getSym(); Expression(); accept(RParenSym, ") expected"); } else abort("Invalid Start to Element "); } static void Range () { // Range == "[" OneRange { OneRange } "]" . accept(LBrackSym, "[ expected"); OneRange(); while (FirstOneRange.contains(sym.kind)) OneRange(); accept(RBrackSym, "] expected"); } static void OneRange () { // OneRange == Atom [ "-" Atom ] . Atom(); if (sym.kind == HyphenSym) { getSym(); Atom(); } } static void Atom () { // Atom = atomic | escaped . // Note the use of a default option to mop up errors switch (sym.kind) { case AtomSym : getSym(); break; case EscapedSym : getSym(); break; default : abort("Invalid Atom"); break; } }
To incorporate "follow set" error recovery we could try something like this:
// +++++++++++++++++++++++++++++++ Parser +++++++++++++++++++++++++++++++++++ static void accept(int wantedSym, String errorMessage) { // Checks that lookahead token is wantedSym if (sym.kind == wantedSym) getSym(); else reportError(errorMessage); } static void test(SymSet allowed, SymSet beacons, String errorMessage) { if (allowed.contains(sym.kind)) return; reportError(errorMessage); SymSet stopSet = allowed.union(beacons); while (!stopSet.contains(sym.kind)) getSym(); } static SymSet FirstExpression = new SymSet(new int[] {AtomSym, EscapedSym, LBrackSym, LParenSym}), FirstElement = new SymSet(new int[] {AtomSym, EscapedSym, LBrackSym, LParenSym}), FirstFactor = new SymSet(new int[] {AtomSym, EscapedSym, LBrackSym, LParenSym}), FirstRestFactor = new SymSet(new int[] {AtomSym, EscapedSym, LBrackSym, LParenSym, ConcatSym}), FirstAtom = new SymSet(new int[] {AtomSym, EscapedSym}), FirstOneRange = new SymSet(new int[] {AtomSym, EscapedSym}), FirstRange = new SymSet(new int[] {LBrackSym}), Iterators = new SymSet(new int[] {RepeatSym, OptionSym, PlusSym}); static void RE (SymSet followers) { // RE == { Expression ";" } EOF . test(followers.union(FirstExpression), new SymSet(), "bad start to file"); // concession while (FirstExpression.contains(sym.kind)) { Expression(followers.union(new SymSet(new int[] {SemicolonSym}))); accept(SemicolonSym, "; expected"); } accept(EOFSym, "EOF expected"); } static void Expression (SymSet followers) { // Expression = Term { "|" Term } . Term(followers.union(new SymSet(new int[] {BarSym}))); while (sym.kind == BarSym) { accept(BarSym, "| expected"); Term(followers.union(new SymSet(new int[] {BarSym}))); } } static void Term (SymSet followers) { // Term = Factor { [ "." ] Factor } . Factor(followers.union(FirstRestFactor)); while (FirstFactor.contains(sym.kind) || sym.kind == ConcatSym) { if (sym.kind == ConcatSym) getSym(); Factor(followers.union(FirstRestFactor)); } } static void Factor (SymSet followers) { // Factor == Element [ "*" | "?" | "+" ] . Element(followers.union(Iterators)); if (Iterators.contains(sym.kind)) getSym(); } static void Element (SymSet followers) { // Element == Atom | Range | "(" Expression ")" . test(FirstElement, followers, "Invalid Start to Element"); if (FirstAtom.contains(sym.kind)) Atom(followers); else if (sym.kind == LBrackSym) Range(followers); else if (sym.kind == LParenSym) { getSym(); Expression(followers.union(new SymSet(new int[] {RParenSym}))); accept(RParenSym, ") expected"); } test(followers, new SymSet(), "unexpected symbol"); } static void Range (SymSet followers) { // Range == "[" OneRange { OneRange } "]" . accept(LBrackSym, "[ expected"); OneRange(followers.union(new SymSet(new int[] {RBrackSym}))); while (FirstOneRange.contains(sym.kind)) OneRange(followers.union(new SymSet(new int[] {RBrackSym}))); accept(RBrackSym, "] expected"); } static void OneRange (SymSet followers) { // OneRange == Atom [ "-" Atom ] . Atom(followers.union(new SymSet(new int[] {HyphenSym}))); if (sym.kind == HyphenSym) { getSym(); Atom(followers); } } static void Atom (SymSet followers) { // Atom = atomic | escaped . // Note the use of a default option to mop up errors switch (sym.kind) { case AtomSym : getSym(); break; case EscapedSym : getSym(); break; default : reportError("Invalid Atom"); break; } } .... getChar(); // Lookahead character getSym(); // Lookahead symbol RE(new SymSet(new int[] {EOFSym})); // Start to parse from the goal symbol if (!errors) System.out.println("Parsed correctly");