Computer Science 3 - 2000

Programming Language Translation


Practical for Week 23, beginning 2 October 2000 - Solutions

There were some outstandingly good solutions handed in, but several people had clearly not found time or put in the effort to complete the assignment. This showed up very clearly in the little prac test, which revealed, rather sadly and surprisingly, that there seemed to be some students who did not have the first clue about how to write a simple recursive descent parser for a tiny system of productions. Please study the solutions to the prac test carefully - they are all on the WWW pages for the course unde the heading of "Prac Quizes".

Another point that I noticed was that many people, both in the test and in the prac, were driving their parsers from the back, so to speak. Given a construction like

A = { "start" Something } "follow" .

it is far better to produce a parser routine like

while (Sym == startSym) { GetSym(); Something(); } Accept(followSym);

than one like

while (Sym != followSym) { GetSym(); Something(); } Accept(followSym);

for the simple reason that there might be something wrong with Something!

Complete source code for three possible solutions to the prac are available on the WWW pages in files PRAC23CA.ZIP (C++ versions) and PRAC23PA.ZIP (Pascal versions).

One complete solution appears towards the end of this sheet. To work up towards that, here is a simple "sudden death" parser for regular expressions, devoid of error recovery:

   1   void RE (void) { // RE == { Expression ";" } EOF .
   2     while (FirstExpression.memb(sym)) {
   3       Expression(); Accept(SemicolonSym, "; expected");
   4     }
   5     Accept(EOFSym, "EOF expected");
   6   }
   7
   8   void Expression (void) { // Expression = Term { "|" Term } .
   9     Term(); while (sym == BarSym) { Accept(BarSym, "| expected"); Term(); }
  10   }
  11
  12   void Term (void) { // Term = Factor { [ "." ] Factor } .
  13     Factor();
  14     while (FirstFactor.memb(sym) || sym == ConcatSym) {
  15       if (sym == ConcatSym) GetSym(); Factor();
  16     }
  17   }
  18
  19   void Factor (void) { // Factor == Element [ "*" | "?" | "+" ] .
  20     Element(); if (Iterators.memb(sym)) GetSym();
  21   }
  22
  23   void Element (void) { // Element == Atom | Range | "(" Expression ")" .
  24     if (FirstAtom.memb(sym)) Atom();
  25     else
  26       if (sym == LBrackSym) Range();
  27       else
  28         if (sym == LParenSym) {
  29           GetSym(); Expression(); Accept(RParenSym, ") expected");
  30         }
  31         else Abort("Invalid Start to Element");
  32   }
  33
  34   void Range (void) { // Range == "[" OneRange { OneRange } "]" .
  35     Accept(LBrackSym, "[ expected");
  36     OneRange(); while (FirstOneRange.memb(sym)) OneRange();
  37     Accept(RBrackSym, "] expected");
  38   }
  39
  40   void OneRange (void) { // OneRange == Atom [ "-" Atom ] .
  41     Atom(); if (sym == HyphenSym) { GetSym(); Atom(); }
  42   }
  43
  44   void Atom (void) { // Atom == atomic | escaped .
  45   // Note the use of a default option to mop up errors
  46     switch (sym) {
  47       case AtomSym : GetSym(); break;
  48       case EscapedSym : GetSym(); break;
  49       default : Abort("Invalid Atom"); break;
  50     }
  51   }

To incorporate error recovery one can go the whole way of introducing calls to a test routine at the start and end of every subparser. To do this one needs to know the appropriate FIRST and FOLLOW sets. Most people who tried this calculated, or got Coco/R to calculate on their behalf, the complete set of followers for each non terminal. A problem with that approach is that the sets beome larger than one would like. The approach below shows how we can start with the followers of RE and then compute the additions to the follower set that we pass as a parameter to subsequent routines. But this is overly clumsy (read on for the next exciting instalment!)

   1   void RE (SymSet followers) { // RE == { Expression ";" } EOF .
   2     test(FirstExpression + followers, SymSet(), "bad start to file");
   3     while (FirstExpression.memb(sym)) {
   4       Expression(followers + SymSet(SemicolonSym));
   5       Accept(SemicolonSym, "; expected");
   6     }
   7     Accept(EOFSym, "EOF expected");
   8   }
   9
  10   void Expression (SymSet followers) {  // Expression = Term { "|" Term } .
  11     test(FirstExpression, followers, "invalid start to Expression");
  12     if (FirstExpression.memb(sym)) {
  13       Term(followers + SymSet(BarSym));
  14       while (sym == BarSym) {
  15         Accept(BarSym, "| expected"); Term(followers + SymSet(BarSym));
  16       }
  17     }
  18     test(followers, SymSet(), "unexpected symbol");
  19   }
  20
  21   void Term (SymSet followers) { // Term = Factor { [ "." ] Factor } .
  22     test(FirstTerm, followers, "invalid start to Term");
  23     if (FirstTerm.memb(sym)) {
  24       Factor(followers + FirstRestFactor);
  25       while (FirstRestFactor.memb(sym)) {
  26         if (sym == ConcatSym) GetSym();
  27         Factor(followers + FirstRestFactor);
  28       }
  29     }
  30     test(followers, SymSet(), "unexpected symbol");
  31   }
  32
  33   void Factor (SymSet followers) { // Factor == Element [ "*" | "?" | "+" ] .
  34     test(FirstFactor, followers, "invalid start to Factor");
  35     if (FirstFactor.memb(sym)) {
  36       Element(followers + Iterators);
  37       if (Iterators.memb(sym)) GetSym();
  38     }
  39     test(followers, SymSet(), "unexpected symbol");
  40   }
  41
  42   void Element (SymSet followers) { // Element == Atom | Range | "(" Expression ")" .
  43     test(FirstElement, followers, "Invalid Start to Element");
  44     if (FirstAtom.memb(sym)) Atom(followers);
  45     else
  46       if (FirstRange.memb(sym)) Range(followers);
  47       else
  48         if (sym == LParenSym) {
  49           GetSym(); Expression(followers + SymSet(RParenSym));
  50           Accept(RParenSym, ") expected");
  51         }
  52     test(followers, SymSet(), "unexpected symbol");
  53   }
  54
  55   void Range (SymSet followers) { // Range == "[" OneRange { OneRange } "]" .
  56     test(FirstRange, followers, "invalid start to Range");
  57     if (FirstRange.memb(sym)) {
  58       Accept(LBrackSym, "[ expected");
  59       OneRange(followers + SymSet(RBrackSym));
  60       while (FirstOneRange.memb(sym)) OneRange(followers + SymSet(RBrackSym));
  61       Accept(RBrackSym, "] expected");
  62     }
  63     test(followers, SymSet(), "unexpected symbol");
  64   }
  65
  66   void OneRange (SymSet followers) { // OneRange == Atom [ "-" Atom ] .
  67     test(FirstOneRange, followers, "invalid start to OneRange");
  68     if (FirstOneRange.memb(sym)) {
  69       Atom(followers + SymSet(HyphenSym));
  70       if (sym == HyphenSym) { GetSym(); Atom(followers); }
  71     }
  72     test(followers, SymSet(), "unexpected symbol");
  73   }
  74
  75   void Atom (SymSet followers) { // Atom == atomic | escaped .
  76   // Note the use of a default option to mop up errors
  77     test(FirstAtom, followers, "invalid start to Atom");
  78     if (FirstAtom.memb(sym)) {
  79       switch (sym) {
  80         case AtomSym :    GetSym(); break;
  81         case EscapedSym : GetSym(); break;
  82         default :         break; // already dealt with
  83       }
  84     }
  85     test(followers, SymSet(), "invalid symbol");
  86   }
  87

When one looks at the FIRST sets one should be struck by the fact that many of them are the same. After a bit of reflection on all this, one should be able to see that we can actually economise on (possibly slow and expensive) calls to the test function, simply by incorporating them at certain critical points. The complete solution below shows how I suggest this problem might be solved. It is a bit crude in its positioning of error messages - several submissions were notably better than mine in this regard. (Incidentally, most years I get better solutions than my own - or in places anyway! It's always fun when that happens.)

   1   // RegExpParser
   2   // Usage: RegExp DataFile [ ListFile ]
   3   // Parse sequence of regular expressions in DataFile
   4   // Incorporates streamlined error recovery
   5   // P.D. Terry, Rhodes University, 2000
   6
   7   #include "misc.h"
   8   #include "set.h"
   9
  10   char outName[256];              // string for results file name
  11   FILE *inFile, *outFile;         // Data and results
  12   bool AtEndOfFile;               // State of character handler
  13   bool errors = false;            // State of overall parse
  14
  15   // Declaring ch as a global variable is done for expediency - global variables
  16   // are not always a good thing
  17
  18   char ch;                        // Lookahead character for scanner
  19
  20   // Define token enumeration
  21
  22   enum SYMBOLS { unknownSym, EOFSym, SemicolonSym, BarSym, LParenSym, LBrackSym,
  23                  RParenSym, RBrackSym, ConcatSym, OptionSym, RepeatSym, PlusSym,
  24                  HyphenSym, AtomSym, EscapedSym};
  25
  26   // Declaring sym as a global variable is done for expediency - global variables
  27   // are not always a good thing
  28
  29   SYMBOLS sym;                    // Lookahead token for parser
  30
  31   // The <EscapedSym> in the next declaration ensures that the sets will all be
  32   // able to contain elements in the range EOFSym .. EscapedSym
  33
  34   typedef Set<EscapedSym> SymSet; // Set type for FIRST sets
  35
  36   SymSet FirstExpression(AtomSym, EscapedSym, LBrackSym, LParenSym);
  37   SymSet FirstElement(AtomSym, EscapedSym, LBrackSym, LParenSym);
  38   SymSet FirstFactor(AtomSym, EscapedSym, LBrackSym, LParenSym);
  39   SymSet FirstRestFactor(AtomSym, EscapedSym, LBrackSym, LParenSym, ConcatSym);
  40   SymSet FirstAtom(AtomSym, EscapedSym);
  41   SymSet FirstOneRange(AtomSym, EscapedSym);
  42   SymSet FirstRange(LBrackSym);
  43   SymSet Iterators(RepeatSym, OptionSym, PlusSym);
  44
  45   void Abort (char *S) {
  46   // Abandon parsing with Error message S
  47     fprintf(outFile, "%s\n", S);
  48     fprintf(stderr, "Errors detected - see %s\n", outName);
  49   // Note that we must close files before we exit
  50     fclose(outFile);
  51     exit(1);
  52   }
  53
  54   // +++++++++++++++++++++++++ Character Handler +++++++++++++++++++++++++
  55
  56   void GetChar (void) {
  57   // Obtains next character ch from inFile, or CHR(0) if EOF reached
  58     if (AtEndOfFile) ch = '\0';
  59     else {
  60       ch = fgetc(inFile);
  61       if (ch != EOF) fputc(ch, outFile);
  62       else { AtEndOfFile = true; ch = '\0'; }
  63     }
  64   }
  65
  66   // +++++++++++++++++++++++++ Scanner +++++++++++++++++++++++++
  67
  68   void GetSym (void) {
  69   // Scans for next sym from inFile
  70     while (ch > 0 && ch <= ' ') GetChar();
  71     switch (ch) {
  72       case '\0' : sym = EOFSym; break;  // no need to GetChar here, of course
  73
  74   // comment handlers have to be carefully written - we must check for unclosed
  75   // comments, as well as making sure that we really do find a token.  Note the
  76   // recursive call to GetSym
  77
  78       case '\\' : do GetChar(); while (ch != '\\' && ch != '\0');
  79                   if (ch == '\0')
  80                     fprintf(outFile, "unclosed comment"); // sym will = EOFSym
  81                   GetChar(); GetSym(); break;
  82       case ';'  : sym = SemicolonSym; GetChar(); break;
  83       case '|'  : sym = BarSym;    GetChar(); break;
  84       case '('  : sym = LParenSym; GetChar(); break;
  85       case '['  : sym = LBrackSym; GetChar(); break;
  86       case ')'  : sym = RParenSym; GetChar(); break;
  87       case ']'  : sym = RBrackSym; GetChar(); break;
  88       case '.'  : sym = ConcatSym; GetChar(); break;
  89       case '?'  : sym = OptionSym; GetChar(); break;
  90       case '*'  : sym = RepeatSym; GetChar(); break;
  91       case '+'  : sym = PlusSym;   GetChar(); break;
  92       case '-'  : sym = HyphenSym; GetChar(); break;
  93
  94   // escaped characters need special treatment  - we must check for matching
  95   // quote marks
  96
  97       case '"'  :
  98       case '\'' : sym = EscapedSym;
  99                   char quote = ch; GetChar(); GetChar();
 100                   if (ch != quote) {
 101                     fprintf(outFile, "Mismatched quotes"); sym = unknownSym;
 102                   }
 103                   GetChar();
 104                   break;
 105
 106   // unusually, perhaps, the default option mops up all other possibilities as
 107   // valid.  Very often default handlers must mop up errors
 108
 109       default : sym = AtomSym; GetChar(); break;
 110     }
 111   }
 112
 113   void Accept (SYMBOLS WantedSym, char *S) {
 114   // Check that lookahead token is WantedSym
 115   // Note that the parameter is of SYMBOLS type - not char or int
 116     if (sym == WantedSym) GetSym();
 117     else { fprintf(outFile, "%s\n", S); errors = true; }
 118   }
 119
 120   void test(SymSet allowed, SymSet beacons, char *S)
 121   // Test whether current Sym is in Allowed, and recover if not
 122   { if (allowed.memb(sym)) return;
 123     fprintf(outFile, "%s\n", S); errors = true;
 124     SymSet stopset = allowed + beacons;
 125     while (!stopset.memb(sym)) GetSym();
 126   }
 127
 128
 129   // +++++++++++++++++++++++++ Parser +++++++++++++++++++++++++
 130   // use prototypes to handle recursion easily
 131
 132   void RE (SymSet followers);
 133   void Expression (SymSet followers);
 134   void Term (SymSet followers);
 135   void Factor (SymSet followers);
 136   void Element (SymSet followers);
 137   void Range (SymSet followers);
 138   void OneRange (SymSet followers);
 139   void Atom (SymSet followers);
 140
 141   void RE (SymSet followers) {
 142   // RE == { Expression ";" } EOF .
 143   // Note that we drive the loop on a FirstExpression check, and not on
 144   //    while (sym != EOFSym) ...
 145   // Parser routines should be driven positively on FIRST sets, not negatively
 146   // on the use of FOLLOW sets
 147     test(FirstExpression + followers, SymSet(), "bad start to file"); // concession
 148     while (FirstExpression.memb(sym)) {
 149       Expression(followers + SymSet(SemicolonSym));
 150       Accept(SemicolonSym, "; expected");
 151     }
 152     Accept(EOFSym, "EOF expected");
 153   }
 154
 155   void Expression (SymSet followers) {
 156   // Expression = Term { "|" Term } .
 157     Term(followers + SymSet(BarSym));
 158     while (sym == BarSym) {
 159       Accept(BarSym, "| expected"); Term(followers + SymSet(BarSym));
 160     }
 161   }
 162
 163   void Term (SymSet followers) {
 164   // Term = Factor { [ "." ] Factor } .
 165     Factor(followers + FirstRestFactor);
 166     while (FirstRestFactor.memb(sym)) {
 167       if (sym == ConcatSym) GetSym();
 168       Factor(followers + FirstRestFactor);
 169     }
 170   }
 171
 172   void Factor (SymSet followers) {
 173   // Factor == Element [ "*" | "?" | "+" ] .
 174     Element(followers + Iterators);
 175     if (Iterators.memb(sym)) GetSym();
 176   }
 177
 178   void Element (SymSet followers) {
 179   // Element == Atom | Range | "(" Expression ")" .
 180     test(FirstElement, followers, "Invalid Start to Element");
 181     if (FirstAtom.memb(sym)) Atom(followers);
 182     else
 183       if (FirstRange.memb(sym)) Range(followers);
 184       else
 185         if (sym == LParenSym) {
 186           GetSym(); Expression(followers + SymSet(RParenSym));
 187           Accept(RParenSym, ") expected");
 188         }
 189     test(followers, SymSet(), "unexpected symbol");
 190   }
 191
 192   void Range (SymSet followers) {
 193   // Range == "[" OneRange { OneRange } "]" .
 194     Accept(LBrackSym, "[ expected");
 195     OneRange(followers + SymSet(RBrackSym));
 196     while (FirstOneRange.memb(sym)) OneRange(followers + SymSet(RBrackSym));
 197     Accept(RBrackSym, "] expected");
 198   }
 199
 200   void OneRange (SymSet followers) {
 201   // OneRange == Atom [ "-" Atom ] .
 202     Atom(followers + SymSet(HyphenSym));
 203     if (sym == HyphenSym) { GetSym(); Atom(followers); }
 204   }
 205
 206   void Atom (SymSet followers) {
 207   // Atom == atomic | escaped .
 208   // Note the use of a default option to mop up errors
 209     switch (sym) {
 210       case AtomSym :
 211         GetSym(); break;
 212       case EscapedSym :
 213         GetSym(); break;
 214       default :
 215         fprintf(outFile, "invalid Atom"); break;
 216     }
 217   }
 218
 219   void main (int argc, char *argv[])  {
 220     if (argc < 2) {
 221       fprintf(stderr, "Usage: RegExp DataFile [Results]"); exit(1);
 222     }
 223     if ((inFile = fopen(argv[1], "r")) == NULL) {
 224       fprintf(stderr, "Could not open %s", argv[1]); exit(1);
 225     }
 226     if (argc == 3) strcpy(outName, argv[2]);
 227     else appendextension(argv[1], ".lst", outName);
 228     if ((outFile = fopen(outName, "w")) == NULL) {
 229       fprintf(stderr, "Could not open %s", outName); exit(1);
 230     }
 231   // Initialise character handler
 232     AtEndOfFile = false; GetChar();
 233   // Initialise scanner
 234     GetSym();
 235   // and start to parse
 236     RE(SymSet(EOFSym));
 237   // if we get back here everything may have been okay
 238     if (!errors) fprintf(stderr, "Parsed correctly");
 239     fclose(outFile);
 240   }
 241