There were some very good (but no perfect) solutions handed in, but several people had not absorbed all the detail. This showed up very clearly in the little prac test, which revealed, rather sadly and surprisingly, that there seemed to be some students who did not have the first clue about how to write a simple recursive descent parser for a tiny system of productions. Please study the solutions to the prac test carefully - they are all on the WWW pages for the course under the heading of "Prac Tests".
Another point that I noticed was that many people, both in the test and in the prac, were driving their parsers from the back, so to speak. Given a construction like
A = { "start" Something } "follow" .
it is far better to produce a parser routine like
while (sym.kind == startSym) { getSym(); something(); } accept(followSym);
than one like
while (sym.kind != followSym) { getSym(); something(); } accept(followSym);
for the simple reason that there might be something wrong with something!
Complete source code for three possible solutions to the prac are available on the WWW pages in files PRAC23A.ZIP (Java) or PRAC23AC.ZIP (C#).
Here is the important part of the code for the scanner. Most people got a lot of this correct - except perhaps for the comment handling (which you had been told to read up about in the textbook, so that failing to get that right was just silly). There was some funny clumsy code dealing with the "escaped" tokens - please have a good look at what I suggest, and in particular that one had to "match" the quotation marks.
// +++++++++++++++++++++++ token kinds enumeration +++++++++++++++++++++++++ static final int noSym = 0, EOFSym = 1, SemicolonSym = 2, BarSym = 3, LParenSym = 4, LBrackSym = 5, RParenSym = 6, RBrackSym = 7, ConcatSym = 8, OptionSym = 9, RepeatSym = 10, PlusSym = 11, HyphenSym = 12, AtomSym = 13, EscapedSym = 14; // +++++++++++++++++++++++++++++++ Scanner ++++++++++++++++++++++++++++++++++ // Declaring sym as a global variable is done for expediency - global variables // are not always a good thing static Token sym; static void getSym() { // Scans for next sym from input while (ch > EOF && ch <= ' ') getChar(); // comment handlers have to be carefully written - we must check for unclosed // comments, as well as making sure that we really do find a token. Note the // recursive call to getSym if (ch == '\\') { // skip comment do getChar(); while (ch != '\\' && ch != EOF); if (ch != EOF) { // try again getChar(); getSym(); return; } else { // give up sym = new Token(EOFSym, "EOF"); // perhaps abort("comment was not terminated"); return; } } StringBuilder symLex = new StringBuilder(); int symKind = noSym; symLex.append(ch); switch (ch) { case EOF : symKind = EOFSym; symLex = new StringBuilder("EOF"); break; // no need to getChar here, of course case ';' : symKind = SemicolonSym; getChar(); break; case '|' : symKind = BarSym; getChar(); break; case '(' : symKind = LParenSym; getChar(); break; case '[' : symKind = LBrackSym; getChar(); break; case ')' : symKind = RParenSym; getChar(); break; case ']' : symKind = RBrackSym; getChar(); break; case '.' : symKind = ConcatSym; getChar(); break; case '?' : symKind = OptionSym; getChar(); break; case '*' : symKind = RepeatSym; getChar(); break; case '+' : symKind = PlusSym; getChar(); break; case '-' : symKind = HyphenSym; getChar(); break; // escaped characters need special treatment - we must check for matching // quote marks case '"' : case '\'' : char quote = ch; getChar(); symLex.append(ch); if (ch < ' ') break; // escaped exclude control chars getChar(); symLex.append(ch); if (ch == quote) { symKind = EscapedSym; // else noSym already GetChar(); } break; // unusually, perhaps, the default option mops up all other possibilities as // valid. Very often default handlers must mop up errors default : symKind = AtomSym; getChar(); break; } sym = new Token(symKind, symLex.toString()); }
and here is code for a simple parser with brutal error handling:
// +++++++++++++++++++++++++++++++ Parser +++++++++++++++++++++++++++++++++++ static void accept(int wantedSym, String errorMessage) { // Checks that lookahead token is wantedSym if (sym.kind == wantedSym) getSym(); else abort(errorMessage); } static IntSet FirstExpression = new IntSet(AtomSym, EscapedSym, LBrackSym, LParenSym), FirstFactor = new IntSet(AtomSym, EscapedSym, LBrackSym, LParenSym), FirstAtom = new IntSet(AtomSym, EscapedSym), FirstOneRange = new IntSet(AtomSym, EscapedSym), Iterators = new IntSet(RepeatSym, OptionSym, PlusSym); static void RE () { // RE == { Expression ";" } EOF . // Note that we drive the loop on a FirstExpression check, and not on // while (sym != EOFSym) ... // Parser routines should normally be driven positively on FIRST sets, not negatively // on the use of FOLLOW sets while (FirstExpression.contains(sym.kind)) { Expression(); accept(SemicolonSym, "; expected"); } accept(EOFSym, "EOF expected"); } static void Expression () { // Expression = Term { "|" Term } . Term(); while (sym.kind == BarSym) { getSym(); Term(); } } static void Term () { // Term = Factor { [ "." ] Factor } . Factor(); while (FirstFactor.contains(sym.kind) || sym.kind == ConcatSym) { if (sym.kind == ConcatSym) getSym(); Factor(); } } static void Factor () { // Factor == Element [ "*" | "?" | "+" ] . Element(); if (Iterators.contains(sym.kind)) getSym(); } static void Element () { // Element == Atom | Range | "(" Expression ")" . if (FirstAtom.contains(sym.kind)) Atom(); else if (sym.kind == LBrackSym) Range(); else if (sym.kind == LParenSym) { getSym(); Expression(); accept(RParenSym, ") expected"); } else abort("Invalid Start to Element "); } static void Range () { // Range == "[" OneRange { OneRange } "]" . accept(LBrackSym, "[ expected"); OneRange(); while (FirstOneRange.contains(sym.kind)) OneRange(); accept(RBrackSym, "] expected"); } static void OneRange () { // OneRange == Atom [ "-" Atom ] . Atom(); if (sym.kind == HyphenSym) { getSym(); Atom(); } } static void Atom () { // Atom = atomic | escaped . // Note the use of a default option to mop up errors switch (sym.kind) { case AtomSym : getSym(); break; case EscapedSym : getSym(); break; default : abort("Invalid Atom"); break; } }
To incorporate "follow set" error recovery we could try something like this:
// +++++++++++++++++++++++++++++++ Parser +++++++++++++++++++++++++++++++++++ static void accept(int wantedSym, String errorMessage) { // Checks that lookahead token is wantedSym if (sym.kind == wantedSym) getSym(); else reportError(errorMessage); } static void test(IntSet allowed, IntSet beacons, String errorMessage) { if (allowed.contains(sym.kind)) return; reportError(errorMessage); IntSet stopSet = allowed.union(beacons); while (!stopSet.contains(sym.kind)) getSym(); } static IntSet FirstExpression = new IntSet(AtomSym, EscapedSym, LBrackSym, LParenSym), FirstElement = new IntSet(AtomSym, EscapedSym, LBrackSym, LParenSym), FirstFactor = new IntSet(AtomSym, EscapedSym, LBrackSym, LParenSym), FirstRestFactor = new IntSet(AtomSym, EscapedSym, LBrackSym, LParenSym, ConcatSym), FirstAtom = new IntSet(AtomSym, EscapedSym), FirstOneRange = new IntSet(AtomSym, EscapedSym), FirstRange = new IntSet(LBrackSym), Iterators = new IntSet(RepeatSym, OptionSym, PlusSym); static void RE (IntSet followers) { // RE == { Expression ";" } EOF . test(followers.union(FirstExpression), new IntSet(), "bad start to file"); // concession while (FirstExpression.contains(sym.kind)) { Expression(followers.union(new IntSet(SemicolonSym))); accept(SemicolonSym, "; expected"); } accept(EOFSym, "EOF expected"); } static void Expression (IntSet followers) { // Expression = Term { "|" Term } . Term(followers.union(new IntSet(BarSym))); while (sym.kind == BarSym) { getSym(); Term(followers.union(new IntSet(BarSym))); } } static void Term (IntSet followers) { // Term = Factor { [ "." ] Factor } . Factor(followers.union(FirstRestFactor)); while (FirstFactor.contains(sym.kind) || sym.kind == ConcatSym) { if (sym.kind == ConcatSym) getSym(); Factor(followers.union(FirstRestFactor)); } } static void Factor (IntSet followers) { // Factor == Element [ "*" | "?" | "+" ] . Element(followers.union(Iterators)); if (Iterators.contains(sym.kind)) getSym(); } static void Element (IntSet followers) { // Element == Atom | Range | "(" Expression ")" . test(FirstElement, followers, "Invalid Start to Element"); if (FirstAtom.contains(sym.kind)) Atom(followers); else if (sym.kind == LBrackSym) Range(followers); else if (sym.kind == LParenSym) { getSym(); Expression(followers.union(new IntSet(RParenSym))); accept(RParenSym, ") expected"); } test(followers, new IntSet(), "unexpected symbol"); } static void Range (IntSet followers) { // Range == "[" OneRange { OneRange } "]" . accept(LBrackSym, "[ expected"); OneRange(followers.union(new IntSet(RBrackSym))); while (FirstOneRange.contains(sym.kind)) OneRange(followers.union(new IntSet(RBrackSym))); accept(RBrackSym, "] expected"); } static void OneRange (IntSet followers) { // OneRange == Atom [ "-" Atom ] . Atom(followers.union(new IntSet(HyphenSym))); if (sym.kind == HyphenSym) { getSym(); Atom(followers); } } static void Atom (IntSet followers) { // Atom = atomic | escaped . // Note the use of a default option to mop up errors switch (sym.kind) { case AtomSym : getSym(); break; case EscapedSym : getSym(); break; default : reportError("Invalid Atom"); break; } } .... getChar(); // Lookahead character getSym(); // Lookahead symbol RE(new IntSet(EOFSym)); // Start to parse from the goal symbol if (!errors) System.out.println("Parsed correctly");