Computer Science 3 - 2001

Programming Language Translation


Practical for Week 18, beginning 20 August 2001

There were some outstandingly good solutions handed in, but several people had clearly not found time or put in the effort to complete the assignment. This showed up very clearly in the little prac test, which revealed, rather sadly and surprisingly, that there seemed to be some students who did not have the first clue about how to write a simple recursive descent parser for a tiny system of productions. Please study the solutions to the prac test carefully - they are all on the WWW pages for the course under the heading of "Prac Quizes".

Another point that I noticed was that many people, both in the test and in the prac, were driving their parsers from the back, so to speak. Given a construction like

A = { "start" Something } "follow" .

it is far better to produce a parser routine like

while (Sym == startSym) { GetSym(); Something(); } Accept(followSym);

than one like

while (Sym != followSym) { GetSym(); Something(); } Accept(followSym);

for the simple reason that there might be something wrong with Something!

Complete source code for three possible solutions to the prac are available on the WWW pages in files PRAC18A.ZIP.

The scanner was reasonably done by most people. However, only one group tried to handle the (unasked) problem of dealing with comments. Since this is an integral part of most scanners, study my solution below carefully and see how this could have been done:

  void GetSym (void) {
  // Scans for next SYMBOL value from inFile, store it in global Sym
    while (ch > 0 && ch <= ' ') GetChar();  // skip whitespace
    if (isdigit(ch)) {                      // number
      sym = numSym;
      do GetChar(); while (isdigit(ch));
    } else
    if (isalpha(ch) || ch == '_' ) {        // identifier or key word
      char str[100];
      int i = 0;
      sym = identSym;
      do { str[i++] = ch; GetChar(); } while (isalnum(ch) || ch == '_');
      str[i] = '\0';
      if (strcmp("int", str) == 0) sym = intSym;
      if (strcmp("char", str) == 0) sym = charSym;
      if (strcmp("bool", str) == 0) sym = boolSym;
      if (strcmp("void", str) == 0) sym = voidSym;
    } else
    switch (ch) {

  // comment handlers have to be carefully written - we must check for unclosed
  // comments, as well as making sure that we really do find a token.  Note the
  // recursive call to GetSym

      case '/' :
        GetChar();
        if (ch == '*') {
          GetChar();
          char lastCh;
          do {
            lastCh = ch;
            GetChar();
          } while (!(lastCh == '*' && ch == '/') && ch != '\0');
          if (ch == '\0')
            fprintf(outFile, "unclosed comment"); // sym will be EOFSym
          GetChar(); GetSym(); break;
        } else
        if (ch == '/') {
          do GetChar(); while (ch != '\n' && ch != '\0');
          if (ch == '\0')
            fprintf(outFile, "unclosed comment"); // sym will be EOFSym
          GetChar(); GetSym(); break;
        }
        else sym = unknownSym;
        break;
      case ';' :
        sym = semicolonSym; GetChar(); break;
      case '(' :
        sym = lparenSym; GetChar(); break;
      case '[' :
        sym = lbrackSym; GetChar(); break;
      case ')' :
        sym = rparenSym; GetChar(); break;
      case ']' :
        sym = rbrackSym; GetChar(); break;
      case '*' :
        sym = pointerSym; GetChar(); break;
      case ',' :
        sym = commaSym;   GetChar(); break;
      case '\0':
        sym = EOFSym; break;  // no need to GetChar here, of course
      default  :
        sym = unknownSym; GetChar(); break;
    }
  }

To work up towards the development of the complementary parser, here is most of a simple "sudden death" parser, devoid of error recovery:

  SymSet FirstCdecls   (intSym, charSym, boolSym, voidSym, EOFSym);
  SymSet FirstDecList  (intSym, charSym, boolSym, voidSym);
  SymSet FirstType     (intSym, charSym, boolSym, voidSym);
  SymSet FirstOneDecl  (pointerSym, identSym, lparenSym);
  SymSet FirstDirect   (identSym, lparenSym);
  SymSet FirstSuffix   (lbrackSym, lparenSym);
  SymSet FirstParams   (lparenSym);
  SymSet FirstOneParam (intSym, charSym, boolSym, voidSym);
  SymSet FirstArray    (lbrackSym);

  void abort (char *S) {
  // Abandon parsing with Error message S
    fprintf(outFile, "%s\n", S);
    fprintf(stderr, "Errors detected - see %s\n", outName);
  // Note that we must close files before we exit
    fclose(outFile); exit(EXIT_FAILURE);
  }

  void accept (SYMBOLS WantedSym, char *S) {
  // Check that lookahead token is WantedSym
  // Note that the parameter is of SYMBOLS type - not char or int
    if (sym == WantedSym) GetSym(); else abort(S);
  }

  void Cdecls (void) {
  // Cdecls = { DecList } EOF .
    while (FirstDecList.memb(sym)) DecList();
    accept(EOFSym, "EOF expected");
  }

  void DecList (void) {
  // DecList = Type OneDecl { "," OneDecl } ";"  .
    Type();
    OneDecl();
    while (sym == commaSym) { GetSym(); OneDecl(); }
    accept(semicolonSym, "semicolon expected");
  }

  void Type (void) {
  // Type = "int" | "void" | "bool" | "char" .
    if (FirstType.memb(sym)) GetSym(); else abort("invalid Type");
  }

  void OneDecl (void) {
  // OneDecl = "*" OneDecl | Direct .
    if (sym == pointerSym) { GetSym(); OneDecl(); }
    else Direct();
  }

  void Direct (void) {
  // Direct = ( ident | "(" OneDecl ")" ) [ Suffix ] .
    switch(sym) {
      case identSym:
        GetSym(); break;
      case lparenSym:
        GetSym(); OneDecl(); accept(rparenSym, ") expected"); break;
      default:
        abort("invalid start to Direct"); break;
    }
    if (FirstSuffix.memb(sym)) Suffix();
  }

  void Suffix (void) {
  // Suffix = Array { Array } | Params .
    if (FirstArray.memb(sym)) {
      Array();
      while (FirstArray.memb(sym)) Array();
    }
    else
      if (FirstParams.memb(sym)) Params();
    else abort("invalid start to suffix");
  }

  void Params (void) {
  // Params = "(" [ OneParam { "," OneParam } ] ")" .
    accept(lparenSym, "( expected");
    if (FirstOneParam.memb(sym)) {
      OneParam();
      while (sym == commaSym) {
        GetSym(); OneParam();
      }
    }
    accept(rparenSym, ") expected");
  }

  void OneParam (void) {
  // OneParam = Type [ OneDecl ] .
    Type();
    if (FirstOneDecl.memb(sym)) OneDecl();
  }

  void Array (void) {
  // Array = "[" [ number ] "]" .
    accept(lbrackSym, "[ expected");
    if (sym == numSym) GetSym();
    accept(rbrackSym, "] expected");
  }

To incorporate error recovery one can go the whole way of introducing calls to a test routine at the start and end of every subparser. To do this one needs to know the appropriate FIRST and FOLLOW sets. Most people who tried this calculated, or got Coco/R to calculate on their behalf, the complete set of followers for each non terminal. A problem with that approach is that the sets beome larger than one would like. The approach below shows how we can start with the followers of CDecls and then compute the additions to the follower set that we pass as a parameter to subsequent routines. But this is overly clumsy (read on for the next exciting instalment!)

  void test (SymSet allowed, SymSet beacons, char *S) {
  // Test whether current Sym is in Allowed, and recover if not
    if (allowed.memb(sym)) return;
    fprintf(outFile, "%s\n", S); errors = true;
    SymSet stopset = allowed + beacons;
    while (!stopset.memb(sym)) GetSym();
  }

  void Cdecls (SymSet followers) {
  // Cdecls = { DecList } EOF .
    test(FirstCdecls + followers, SymSet(), "bad start to file");
    while (FirstDecList.memb(sym))
      DecList(followers + FirstDecList + SymSet(EOFSym));
    accept(EOFSym, "EOF expected");
  }

  void DecList (SymSet followers) {
  // DecList = Type OneDecl { "," OneDecl } ";" .
    test(FirstDecList, followers, "invalid start to DecList");
    if (FirstDecList.memb(sym)) {
      Type(followers + FirstOneDecl);
      OneDecl(followers + SymSet(commaSym, semicolonSym));
      while (sym == commaSym) {
        GetSym(); OneDecl(followers + SymSet(commaSym, semicolonSym));
      }
      accept(semicolonSym, "semicolon expected");
    }
    test(followers, SymSet(), "DecList : unexpected symbol");
  }

  void Type (SymSet followers) {
  // Type = "int" | "void" | "bool" | "char" .
    test(FirstType, followers, "invalid start to Type");
    if (FirstType.memb(sym)) GetSym();
    test(followers, SymSet(), "Type : unexpected symbol");
  }

  void OneDecl (SymSet followers) {
  // OneDecl = "*" OneDecl | Direct .
    test(FirstOneDecl, followers, "invalid start to OneDecl");
    if (FirstOneDecl.memb(sym)) {
      if (sym == pointerSym) {
        GetSym(); OneDecl(followers);
      }
      else Direct(followers);
    }
    test(followers, SymSet(), "OneDecl : unexpected symbol");
  }

  void Direct (SymSet followers) {
  // Direct = ( ident | "(" OneDecl ")" ) [ Suffix ] .
    test(FirstDirect, FirstSuffix + followers, "invalid start to Direct");
    if (FirstDirect.memb(sym)) {
      switch(sym) {
        case identSym:
          GetSym(); break;
        case lparenSym:
          GetSym(); OneDecl(followers + SymSet(rparenSym));
          accept(rparenSym, ") expected"); break;
        default:
          break; // already handled
      }
    }
    if (FirstSuffix.memb(sym)) Suffix(followers);
    test(followers, SymSet(), "Direct : unexpected symbol");
  }

  void Suffix (SymSet followers) {
  // Suffix = Array { Array } | Params .
    test(FirstSuffix, followers, "invalid start to Suffix");
    if (FirstArray.memb(sym)) {
      Array(followers + FirstArray);
      while (FirstArray.memb(sym)) Array(followers + FirstArray);
    }
    else
      if (FirstParams.memb(sym)) Params(followers);
    test(followers, SymSet(), "Suffix : unexpected symbol");
  }

  void Params (SymSet followers) {
  // Params = "(" [ OneParam { "," OneParam } ] ")" .
    test(FirstParams, followers, "invalid start to Params");
    if (FirstParams.memb(sym)) {
      accept(lparenSym, "( expected");
      if (FirstOneParam.memb(sym)) {
        OneParam(followers + SymSet(commaSym, rparenSym));
        while (sym == commaSym) {
          GetSym(); OneParam(followers + SymSet(commaSym, rparenSym));
        }
      }
    }
    accept(rparenSym, ") expected");
    test(followers, SymSet(), "Params : unexpected symbol");
  }

  void OneParam (SymSet followers) {
  // OneParam = Type [ OneDecl ] .
    test(FirstOneParam, FirstOneDecl + followers, "invalid start to OneParam");
    if (FirstType.memb(sym)) Type(followers + FirstOneDecl);
    if (FirstOneDecl.memb(sym)) OneDecl(followers);
    test(followers, SymSet(), "OneParam : unexpected symbol");
  }

  void Array (SymSet followers) {
  // Array = "[" [ number ] "]" .
    test(FirstArray, followers, "invalid start to Array");
    if (FirstArray.memb(sym)) {
      accept(lbrackSym, "[ expected");
      if (sym == numSym) GetSym();
      accept(rbrackSym, "] expected");
    }
    test(followers, SymSet(), "Array : unexpected symbol");
  }

When one looks at the FIRST sets one should be struck by the fact that many of them are the same. After a bit of reflection on all this, one should be able to see that we can actually economise on (possibly slow and expensive) calls to the test function, simply by incorporating them only at certain critical points. The complete solution below shows how I suggest this problem might be solved. It is a bit crude in its positioning of error messages - several submissions were notably better than mine in this regard. (Incidentally, most years I get better solutions than my own - or in places anyway! It's always fun when that happens.)

  void reportError (char *S) {
  // Report error with message S
    fprintf(outFile, "%s\n", S); errors = true;
  }

  void accept (SYMBOLS WantedSym, char *S) {
  // Check that lookahead token is WantedSym
  // Note that the parameter is of SYMBOLS type - not char or int
    if (sym == WantedSym) GetSym(); else reportError(S);
  }

  void test (SymSet allowed, SymSet beacons, char *S) {
  // Test whether current Sym is in Allowed, and recover if not
    if (allowed.memb(sym)) return;
    reportError(S);
    SymSet stopset = allowed + beacons;
    while (!stopset.memb(sym)) GetSym();
  }

  void Cdecls (SymSet followers) {
  // Cdecls = { DecList } EOF .
  // First test allows us to skip any real rubbish at the beginning
    test(FirstCdecls + followers, SymSet(), "bad start to declarations");
    while (FirstDecList.memb(sym))
      DecList(followers + FirstDecList + SymSet(EOFSym));
    accept(EOFSym, "EOF expected");
  }

  void DecList (SymSet followers) {
  // DecList = Type OneDecl { "," OneDecl } ";" .
    Type(followers + FirstOneDecl);
    OneDecl(followers + SymSet(commaSym, semicolonSym));
    while (sym == commaSym) {
      GetSym(); OneDecl(followers + SymSet(commaSym, semicolonSym));
    }
    test(SymSet(semicolonSym), followers, "semicolon expected");
    if (sym == semicolonSym) GetSym();
  }

  void Type (SymSet followers) {
  // Type = "int" | "void" | "bool" | "char" .
    if (FirstType.memb(sym)) GetSym(); else reportError("invalid Type");
  }

  void OneDecl (SymSet followers) {
  // OneDecl = "*" OneDecl | Direct .
    test(FirstOneDecl, followers, "invalid Declaration");
    if (FirstOneDecl.memb(sym)) {
      if (sym == pointerSym) {
        GetSym(); OneDecl(followers);
      }
      else Direct(followers);
    }
    test(followers, SymSet(), "unexpected symbol");
  }

  void Direct (SymSet followers) {
  // Direct = ( ident | "(" OneDecl ")" ) [ Suffix ] .
    switch(sym) {
      case identSym:
        GetSym(); break;
      case lparenSym:
        GetSym(); OneDecl(followers + SymSet(rparenSym));
        accept(rparenSym, ") expected"); break;
      default:
        reportError("identifier or ( expected"); break;
    }
    if (FirstSuffix.memb(sym)) Suffix(followers);
    test(followers, SymSet(), "unexpected symbol");
  }

  void Suffix (SymSet followers) {
  // Suffix = Array { Array } | Params .
    if (FirstArray.memb(sym)) {
      Array(followers + FirstArray);
      while (FirstArray.memb(sym)) Array(followers + FirstArray);
    }
    else Params(followers);
  }

  void Params (SymSet followers) {
  // Params = "(" [ OneParam { "," OneParam } ] ")" .
    accept(lparenSym, "( expected");
    if (FirstOneParam.memb(sym)) {
      OneParam(followers + SymSet(commaSym, rparenSym));
      while (sym == commaSym) {
        GetSym(); OneParam(followers + SymSet(commaSym, rparenSym));
      }
    }
    accept(rparenSym, ") expected");
  }

  void OneParam (SymSet followers) {
  // OneParam = Type [ OneDecl ] .
    Type(followers + FirstOneDecl);
    if (FirstOneDecl.memb(sym)) OneDecl(followers);
    test(followers, SymSet(), "unexpected symbol in parameter declaration");
  }

  void Array (SymSet followers) {
  // Array = "[" [ number ] "]" .
    accept(lbrackSym, "[ expected");
    if (sym == numSym) GetSym();
    accept(rbrackSym, "] expected");
  }


Home  © P.D. Terry