There were some outstandingly good solutions handed in, but several people had clearly not found time or put in the effort to complete the assignment. This showed up very clearly in the little prac test, which revealed, rather sadly and surprisingly, that there seemed to be some students who did not have the first clue about how to write a simple recursive descent parser for a tiny system of productions. Please study the solutions to the prac test carefully - they are all on the WWW pages for the course unde the heading of "Prac Quizes".
Another point that I noticed was that many people, both in the test and in the prac, were driving their parsers from the back, so to speak. Given a construction like
A = { "start" Something } "follow" .
it is far better to produce a parser routine like
while (Sym == startSym) { GetSym(); Something(); } Accept(followSym);
than one like
while (Sym != followSym) { GetSym(); Something(); } Accept(followSym);
for the simple reason that there might be something wrong with Something!
Complete source code for three possible solutions to the prac are available on the WWW pages in files PRAC23CA.ZIP (C++ versions) and PRAC23PA.ZIP (Pascal versions).
One complete solution appears towards the end of this sheet. To work up towards that, here is a simple "sudden death" parser for regular expressions, devoid of error recovery:
1 void RE (void) { // RE == { Expression ";" } EOF . 2 while (FirstExpression.memb(sym)) { 3 Expression(); Accept(SemicolonSym, "; expected"); 4 } 5 Accept(EOFSym, "EOF expected"); 6 } 7 8 void Expression (void) { // Expression = Term { "|" Term } . 9 Term(); while (sym == BarSym) { Accept(BarSym, "| expected"); Term(); } 10 } 11 12 void Term (void) { // Term = Factor { [ "." ] Factor } . 13 Factor(); 14 while (FirstFactor.memb(sym) || sym == ConcatSym) { 15 if (sym == ConcatSym) GetSym(); Factor(); 16 } 17 } 18 19 void Factor (void) { // Factor == Element [ "*" | "?" | "+" ] . 20 Element(); if (Iterators.memb(sym)) GetSym(); 21 } 22 23 void Element (void) { // Element == Atom | Range | "(" Expression ")" . 24 if (FirstAtom.memb(sym)) Atom(); 25 else 26 if (sym == LBrackSym) Range(); 27 else 28 if (sym == LParenSym) { 29 GetSym(); Expression(); Accept(RParenSym, ") expected"); 30 } 31 else Abort("Invalid Start to Element"); 32 } 33 34 void Range (void) { // Range == "[" OneRange { OneRange } "]" . 35 Accept(LBrackSym, "[ expected"); 36 OneRange(); while (FirstOneRange.memb(sym)) OneRange(); 37 Accept(RBrackSym, "] expected"); 38 } 39 40 void OneRange (void) { // OneRange == Atom [ "-" Atom ] . 41 Atom(); if (sym == HyphenSym) { GetSym(); Atom(); } 42 } 43 44 void Atom (void) { // Atom == atomic | escaped . 45 // Note the use of a default option to mop up errors 46 switch (sym) { 47 case AtomSym : GetSym(); break; 48 case EscapedSym : GetSym(); break; 49 default : Abort("Invalid Atom"); break; 50 } 51 }
To incorporate error recovery one can go the whole way of introducing calls to a test routine at the start and end of every subparser. To do this one needs to know the appropriate FIRST and FOLLOW sets. Most people who tried this calculated, or got Coco/R to calculate on their behalf, the complete set of followers for each non terminal. A problem with that approach is that the sets beome larger than one would like. The approach below shows how we can start with the followers of RE and then compute the additions to the follower set that we pass as a parameter to subsequent routines. But this is overly clumsy (read on for the next exciting instalment!)
1 void RE (SymSet followers) { // RE == { Expression ";" } EOF . 2 test(FirstExpression + followers, SymSet(), "bad start to file"); 3 while (FirstExpression.memb(sym)) { 4 Expression(followers + SymSet(SemicolonSym)); 5 Accept(SemicolonSym, "; expected"); 6 } 7 Accept(EOFSym, "EOF expected"); 8 } 9 10 void Expression (SymSet followers) { // Expression = Term { "|" Term } . 11 test(FirstExpression, followers, "invalid start to Expression"); 12 if (FirstExpression.memb(sym)) { 13 Term(followers + SymSet(BarSym)); 14 while (sym == BarSym) { 15 Accept(BarSym, "| expected"); Term(followers + SymSet(BarSym)); 16 } 17 } 18 test(followers, SymSet(), "unexpected symbol"); 19 } 20 21 void Term (SymSet followers) { // Term = Factor { [ "." ] Factor } . 22 test(FirstTerm, followers, "invalid start to Term"); 23 if (FirstTerm.memb(sym)) { 24 Factor(followers + FirstRestFactor); 25 while (FirstRestFactor.memb(sym)) { 26 if (sym == ConcatSym) GetSym(); 27 Factor(followers + FirstRestFactor); 28 } 29 } 30 test(followers, SymSet(), "unexpected symbol"); 31 } 32 33 void Factor (SymSet followers) { // Factor == Element [ "*" | "?" | "+" ] . 34 test(FirstFactor, followers, "invalid start to Factor"); 35 if (FirstFactor.memb(sym)) { 36 Element(followers + Iterators); 37 if (Iterators.memb(sym)) GetSym(); 38 } 39 test(followers, SymSet(), "unexpected symbol"); 40 } 41 42 void Element (SymSet followers) { // Element == Atom | Range | "(" Expression ")" . 43 test(FirstElement, followers, "Invalid Start to Element"); 44 if (FirstAtom.memb(sym)) Atom(followers); 45 else 46 if (FirstRange.memb(sym)) Range(followers); 47 else 48 if (sym == LParenSym) { 49 GetSym(); Expression(followers + SymSet(RParenSym)); 50 Accept(RParenSym, ") expected"); 51 } 52 test(followers, SymSet(), "unexpected symbol"); 53 } 54 55 void Range (SymSet followers) { // Range == "[" OneRange { OneRange } "]" . 56 test(FirstRange, followers, "invalid start to Range"); 57 if (FirstRange.memb(sym)) { 58 Accept(LBrackSym, "[ expected"); 59 OneRange(followers + SymSet(RBrackSym)); 60 while (FirstOneRange.memb(sym)) OneRange(followers + SymSet(RBrackSym)); 61 Accept(RBrackSym, "] expected"); 62 } 63 test(followers, SymSet(), "unexpected symbol"); 64 } 65 66 void OneRange (SymSet followers) { // OneRange == Atom [ "-" Atom ] . 67 test(FirstOneRange, followers, "invalid start to OneRange"); 68 if (FirstOneRange.memb(sym)) { 69 Atom(followers + SymSet(HyphenSym)); 70 if (sym == HyphenSym) { GetSym(); Atom(followers); } 71 } 72 test(followers, SymSet(), "unexpected symbol"); 73 } 74 75 void Atom (SymSet followers) { // Atom == atomic | escaped . 76 // Note the use of a default option to mop up errors 77 test(FirstAtom, followers, "invalid start to Atom"); 78 if (FirstAtom.memb(sym)) { 79 switch (sym) { 80 case AtomSym : GetSym(); break; 81 case EscapedSym : GetSym(); break; 82 default : break; // already dealt with 83 } 84 } 85 test(followers, SymSet(), "invalid symbol"); 86 } 87
When one looks at the FIRST sets one should be struck by the fact that many of them are the same. After a bit of reflection on all this, one should be able to see that we can actually economise on (possibly slow and expensive) calls to the test function, simply by incorporating them at certain critical points. The complete solution below shows how I suggest this problem might be solved. It is a bit crude in its positioning of error messages - several submissions were notably better than mine in this regard. (Incidentally, most years I get better solutions than my own - or in places anyway! It's always fun when that happens.)
1 // RegExpParser 2 // Usage: RegExp DataFile [ ListFile ] 3 // Parse sequence of regular expressions in DataFile 4 // Incorporates streamlined error recovery 5 // P.D. Terry, Rhodes University, 2000 6 7 #include "misc.h" 8 #include "set.h" 9 10 char outName[256]; // string for results file name 11 FILE *inFile, *outFile; // Data and results 12 bool AtEndOfFile; // State of character handler 13 bool errors = false; // State of overall parse 14 15 // Declaring ch as a global variable is done for expediency - global variables 16 // are not always a good thing 17 18 char ch; // Lookahead character for scanner 19 20 // Define token enumeration 21 22 enum SYMBOLS { unknownSym, EOFSym, SemicolonSym, BarSym, LParenSym, LBrackSym, 23 RParenSym, RBrackSym, ConcatSym, OptionSym, RepeatSym, PlusSym, 24 HyphenSym, AtomSym, EscapedSym}; 25 26 // Declaring sym as a global variable is done for expediency - global variables 27 // are not always a good thing 28 29 SYMBOLS sym; // Lookahead token for parser 30 31 // The <EscapedSym> in the next declaration ensures that the sets will all be 32 // able to contain elements in the range EOFSym .. EscapedSym 33 34 typedef Set<EscapedSym> SymSet; // Set type for FIRST sets 35 36 SymSet FirstExpression(AtomSym, EscapedSym, LBrackSym, LParenSym); 37 SymSet FirstElement(AtomSym, EscapedSym, LBrackSym, LParenSym); 38 SymSet FirstFactor(AtomSym, EscapedSym, LBrackSym, LParenSym); 39 SymSet FirstRestFactor(AtomSym, EscapedSym, LBrackSym, LParenSym, ConcatSym); 40 SymSet FirstAtom(AtomSym, EscapedSym); 41 SymSet FirstOneRange(AtomSym, EscapedSym); 42 SymSet FirstRange(LBrackSym); 43 SymSet Iterators(RepeatSym, OptionSym, PlusSym); 44 45 void Abort (char *S) { 46 // Abandon parsing with Error message S 47 fprintf(outFile, "%s\n", S); 48 fprintf(stderr, "Errors detected - see %s\n", outName); 49 // Note that we must close files before we exit 50 fclose(outFile); 51 exit(1); 52 } 53 54 // +++++++++++++++++++++++++ Character Handler +++++++++++++++++++++++++ 55 56 void GetChar (void) { 57 // Obtains next character ch from inFile, or CHR(0) if EOF reached 58 if (AtEndOfFile) ch = '\0'; 59 else { 60 ch = fgetc(inFile); 61 if (ch != EOF) fputc(ch, outFile); 62 else { AtEndOfFile = true; ch = '\0'; } 63 } 64 } 65 66 // +++++++++++++++++++++++++ Scanner +++++++++++++++++++++++++ 67 68 void GetSym (void) { 69 // Scans for next sym from inFile 70 while (ch > 0 && ch <= ' ') GetChar(); 71 switch (ch) { 72 case '\0' : sym = EOFSym; break; // no need to GetChar here, of course 73 74 // comment handlers have to be carefully written - we must check for unclosed 75 // comments, as well as making sure that we really do find a token. Note the 76 // recursive call to GetSym 77 78 case '\\' : do GetChar(); while (ch != '\\' && ch != '\0'); 79 if (ch == '\0') 80 fprintf(outFile, "unclosed comment"); // sym will = EOFSym 81 GetChar(); GetSym(); break; 82 case ';' : sym = SemicolonSym; GetChar(); break; 83 case '|' : sym = BarSym; GetChar(); break; 84 case '(' : sym = LParenSym; GetChar(); break; 85 case '[' : sym = LBrackSym; GetChar(); break; 86 case ')' : sym = RParenSym; GetChar(); break; 87 case ']' : sym = RBrackSym; GetChar(); break; 88 case '.' : sym = ConcatSym; GetChar(); break; 89 case '?' : sym = OptionSym; GetChar(); break; 90 case '*' : sym = RepeatSym; GetChar(); break; 91 case '+' : sym = PlusSym; GetChar(); break; 92 case '-' : sym = HyphenSym; GetChar(); break; 93 94 // escaped characters need special treatment - we must check for matching 95 // quote marks 96 97 case '"' : 98 case '\'' : sym = EscapedSym; 99 char quote = ch; GetChar(); GetChar(); 100 if (ch != quote) { 101 fprintf(outFile, "Mismatched quotes"); sym = unknownSym; 102 } 103 GetChar(); 104 break; 105 106 // unusually, perhaps, the default option mops up all other possibilities as 107 // valid. Very often default handlers must mop up errors 108 109 default : sym = AtomSym; GetChar(); break; 110 } 111 } 112 113 void Accept (SYMBOLS WantedSym, char *S) { 114 // Check that lookahead token is WantedSym 115 // Note that the parameter is of SYMBOLS type - not char or int 116 if (sym == WantedSym) GetSym(); 117 else { fprintf(outFile, "%s\n", S); errors = true; } 118 } 119 120 void test(SymSet allowed, SymSet beacons, char *S) 121 // Test whether current Sym is in Allowed, and recover if not 122 { if (allowed.memb(sym)) return; 123 fprintf(outFile, "%s\n", S); errors = true; 124 SymSet stopset = allowed + beacons; 125 while (!stopset.memb(sym)) GetSym(); 126 } 127 128 129 // +++++++++++++++++++++++++ Parser +++++++++++++++++++++++++ 130 // use prototypes to handle recursion easily 131 132 void RE (SymSet followers); 133 void Expression (SymSet followers); 134 void Term (SymSet followers); 135 void Factor (SymSet followers); 136 void Element (SymSet followers); 137 void Range (SymSet followers); 138 void OneRange (SymSet followers); 139 void Atom (SymSet followers); 140 141 void RE (SymSet followers) { 142 // RE == { Expression ";" } EOF . 143 // Note that we drive the loop on a FirstExpression check, and not on 144 // while (sym != EOFSym) ... 145 // Parser routines should be driven positively on FIRST sets, not negatively 146 // on the use of FOLLOW sets 147 test(FirstExpression + followers, SymSet(), "bad start to file"); // concession 148 while (FirstExpression.memb(sym)) { 149 Expression(followers + SymSet(SemicolonSym)); 150 Accept(SemicolonSym, "; expected"); 151 } 152 Accept(EOFSym, "EOF expected"); 153 } 154 155 void Expression (SymSet followers) { 156 // Expression = Term { "|" Term } . 157 Term(followers + SymSet(BarSym)); 158 while (sym == BarSym) { 159 Accept(BarSym, "| expected"); Term(followers + SymSet(BarSym)); 160 } 161 } 162 163 void Term (SymSet followers) { 164 // Term = Factor { [ "." ] Factor } . 165 Factor(followers + FirstRestFactor); 166 while (FirstRestFactor.memb(sym)) { 167 if (sym == ConcatSym) GetSym(); 168 Factor(followers + FirstRestFactor); 169 } 170 } 171 172 void Factor (SymSet followers) { 173 // Factor == Element [ "*" | "?" | "+" ] . 174 Element(followers + Iterators); 175 if (Iterators.memb(sym)) GetSym(); 176 } 177 178 void Element (SymSet followers) { 179 // Element == Atom | Range | "(" Expression ")" . 180 test(FirstElement, followers, "Invalid Start to Element"); 181 if (FirstAtom.memb(sym)) Atom(followers); 182 else 183 if (FirstRange.memb(sym)) Range(followers); 184 else 185 if (sym == LParenSym) { 186 GetSym(); Expression(followers + SymSet(RParenSym)); 187 Accept(RParenSym, ") expected"); 188 } 189 test(followers, SymSet(), "unexpected symbol"); 190 } 191 192 void Range (SymSet followers) { 193 // Range == "[" OneRange { OneRange } "]" . 194 Accept(LBrackSym, "[ expected"); 195 OneRange(followers + SymSet(RBrackSym)); 196 while (FirstOneRange.memb(sym)) OneRange(followers + SymSet(RBrackSym)); 197 Accept(RBrackSym, "] expected"); 198 } 199 200 void OneRange (SymSet followers) { 201 // OneRange == Atom [ "-" Atom ] . 202 Atom(followers + SymSet(HyphenSym)); 203 if (sym == HyphenSym) { GetSym(); Atom(followers); } 204 } 205 206 void Atom (SymSet followers) { 207 // Atom == atomic | escaped . 208 // Note the use of a default option to mop up errors 209 switch (sym) { 210 case AtomSym : 211 GetSym(); break; 212 case EscapedSym : 213 GetSym(); break; 214 default : 215 fprintf(outFile, "invalid Atom"); break; 216 } 217 } 218 219 void main (int argc, char *argv[]) { 220 if (argc < 2) { 221 fprintf(stderr, "Usage: RegExp DataFile [Results]"); exit(1); 222 } 223 if ((inFile = fopen(argv[1], "r")) == NULL) { 224 fprintf(stderr, "Could not open %s", argv[1]); exit(1); 225 } 226 if (argc == 3) strcpy(outName, argv[2]); 227 else appendextension(argv[1], ".lst", outName); 228 if ((outFile = fopen(outName, "w")) == NULL) { 229 fprintf(stderr, "Could not open %s", outName); exit(1); 230 } 231 // Initialise character handler 232 AtEndOfFile = false; GetChar(); 233 // Initialise scanner 234 GetSym(); 235 // and start to parse 236 RE(SymSet(EOFSym)); 237 // if we get back here everything may have been okay 238 if (!errors) fprintf(stderr, "Parsed correctly"); 239 fclose(outFile); 240 } 241