Computer Science 3 - 2003

Programming Language Translation


Practical for Week 23, beginning 29 September 2003 - Solutions

There were some outstandingly good solutions handed in, but several people had clearly not found time or put in the effort to complete the assignment. Please study the solutions to the prac test carefully - they are all on the WWW pages for the course under the heading of "Prac Quizes".

Another point that I noticed was that some people were driving their parsers from the back, so to speak. Given a construction like

A = { "start" Something } "follow" .

it is far better to produce a parser routine like

while (sym.kind == startSym) { getSym(); Something(); } Accept(followSym);

than one like

while (sym.kind != followSym) { getSym(); Something(); } Accept(followSym);

for the simple reason that there might be something wrong with Something.

Complete source code for solutions to the prac - including the error recovery additions that you were asked to think about - are available on the WWW pages in the file PRAC23A.ZIP.

The scanner was reasonably done by most people. Comments are trickier than they look so if you did not get that section right study my solution below carefully and see how they should be handled. The trick is to be able to handle comments that ore never closed, comments that follow one another with no tokens in between them, and comments that look as they are about to finish, but then do not (like /* comment with * and / ). internally */

  static int literalKind(StringBuffer lex) {
    String s = lex.toString();
    if (s.equals("int")) return intSym;
    if (s.equals("char")) return charSym;
    if (s.equals("bool")) return boolSym;
    if (s.equals("void")) return voidSym;
    return identSym;
  }

  static boolean isIdentCh(char ch) {
    return Character.isLetter(ch) || (ch == '_');
  }

  static void getSym() {
  // Scans for next sym from input
    while (ch > EOF && ch <= ' ') getChar();
    if (ch == '/') {  // might be a comment
      getChar();
      if (ch == '*') {  /* style comments */
        getChar();
        char lastCh;
        do {
          lastCh = ch;
          getChar();
        } while (!(lastCh == '*' && ch == '/') && ch != EOF);
        if (ch == EOF)
          reportError("unclosed comment"); // sym will be EOFSym
        getChar(); getSym(); return;
      } else
      if (ch == '/') {  // style comments
        do getChar(); while (ch != '\n' && ch != EOF);
        if (ch == EOF)
          reportError("unclosed comment"); // sym will be EOFSym
        getChar(); getSym(); return;
      }
      else {
        sym = new Token(noSym, "/"); return;
      }
    }
    else {
      StringBuffer symLex = new StringBuffer();
      int symKind;
      if (isIdentCh(ch)) {
        do {
          symLex.append(ch); getChar();
        } while (isIdentCh(ch) || Character.isDigit(ch));
        symKind = literalKind(symLex);
      }
      else if (Character.isDigit(ch)) {
        do {
          symLex.append(ch); getChar();
        } while (Character.isDigit(ch));
        symKind = numSym;
      }
      else {
        symLex.append(ch);
        switch (ch) {
          case EOF:
            symLex = new StringBuffer("EOF");   // special representation
            symKind = EOFSym; break;            // no need to getChar here, of course
          case ';':
            symKind = semicolonSym; getChar(); break;
          case ',':
            symKind = commaSym; getChar(); break;
          case '(':
            symKind = lParenSym; getChar(); break;
          case '[':
            symKind = lBrackSym; getChar(); break;
          case ')':
            symKind = rParenSym; getChar(); break;
          case ']':
            symKind = rBrackSym; getChar(); break;
          case '*':
            symKind = pointerSym; getChar(); break;
          default :
            symKind = noSym; getChar(); break;
        }
      }
      sym = new Token(symKind, symLex.toString());
    }
  }

To work up towards the development of the complementary parser, here is most of a simple "sudden death" parser, devoid of error recovery:

  static SymSet FirstCdecls   = new SymSet(new int[] {intSym, charSym, boolSym, voidSym, EOFSym});
  static SymSet FirstDecList  = new SymSet(new int[] {intSym, charSym, boolSym, voidSym});
  static SymSet FirstType     = new SymSet(new int[] {intSym, charSym, boolSym, voidSym});
  static SymSet FirstOneDecl  = new SymSet(new int[] {pointerSym, identSym, lParenSym});
  static SymSet FirstDirect   = new SymSet(new int[] {identSym, lParenSym});
  static SymSet FirstSuffix   = new SymSet(new int[] {lBrackSym, lParenSym});
  static SymSet FirstParams   = new SymSet(new int[] {lParenSym});
  static SymSet FirstOneParam = new SymSet(new int[] {intSym, charSym, boolSym, voidSym});
  static SymSet FirstArray    = new SymSet(new int[] {lBrackSym});

  static void accept(int wantedSym, String errorMessage) {
  // Checks that lookahead token is wantedSym
    if (sym.kind == wantedSym) getSym(); else abort(errorMessage);
  }

  static void Cdecls() {
  // Cdecls = { DecList } EOF .
    while (FirstDecList.contains(sym.kind)) DecList();
    accept(EOFSym, "EOF expected");
  }

  static void DecList() {
  // DecList = Type OneDecl { "," OneDecl } ";"  .
    Type();
    OneDecl();
    while (sym.kind == commaSym) {
      getSym(); OneDecl();
    }
    accept(semicolonSym, "semicolon expected");
  }

  static void Type() {
  // Type = "int" | "void" | "bool" | "char" .
    if (FirstType.contains(sym.kind)) getSym();
    else abort("invalid Type");
  }

  static void OneDecl() {
  // OneDecl = "*" OneDecl | Direct .
    if (sym.kind == pointerSym) {
      getSym(); OneDecl();
    }
    else Direct();
  }

  static void Direct() {
  // Direct = ( ident | "(" OneDecl ")" ) [ Suffix ] .
    switch(sym.kind) {
      case identSym:
        getSym(); break;
      case lParenSym:
        getSym(); OneDecl(); accept(rParenSym, ") expected"); break;
      default: abort("invalid start to Direct"); break;
    }
    if (FirstSuffix.contains(sym.kind)) Suffix();
  }

  static void Suffix() {
  // Suffix = Array { Array } | Params .
    if (FirstArray.contains(sym.kind)) {
      Array();
      while (FirstArray.contains(sym.kind)) Array();
    }
    else
      if (FirstParams.contains(sym.kind)) Params();
    else abort("invalid start to suffix");
  }

  static void Params() {
  // Params = "(" [ OneParam { "," OneParam } ] ")" .
    accept(lParenSym, "( expected");
    if (FirstOneParam.contains(sym.kind)) {
      OneParam();
      while (sym.kind == commaSym) {
        getSym(); OneParam();
      }
    }
    accept(rParenSym, ") expected");
  }

  static void OneParam() {
  // OneParam = Type [ OneDecl ] .
    Type();
    if (FirstOneDecl.contains(sym.kind)) OneDecl();
  }

  static void Array() {
  // Array = "[" [ number ] "]" .
    accept(lBrackSym, "[ expected");
    if (sym.kind == numSym) getSym();
    accept(rBrackSym, "] expected");
  }

To incorporate error recovery one can go to the whole extent of introducing calls to a test routine at the start and end of every subparser. To do this one needs to know the appropriate FIRST and FOLLOW sets. A problem with that approach is that the sets beome larger than one would like. The approach below shows how we can start with the followers of CDecls and then compute the additions to the follower set that we pass as a parameter to subsequent routines. But this is overly clumsy (read on for the next exciting instalment!)

  static void test(SymSet allowed, SymSet beacons, String errorMessage) {
  // Test whether current Sym is in Allowed, and recover if not
    if (allowed.contains(sym.kind)) return;
    reportError(errorMessage);
    SymSet stopSet = allowed.union(beacons);
    while (!stopSet.contains(sym.kind)) getSym();
  }

  static void Cdecls(SymSet followers) {
  // Cdecls = { DecList } EOF .
    test(followers.union(FirstCdecls), new SymSet(), "bad start to file");
    while (FirstDecList.contains(sym.kind))
      DecList(followers.union(FirstDecList).union(new SymSet(new int[] {EOFSym})));
    accept(EOFSym, "EOF expected");
  }

  static void DecList(SymSet followers) {
  // DecList = Type OneDecl { "," OneDecl } ";"  .
    test(FirstDecList, followers, "invalid start to DecList");
    if (FirstDecList.contains(sym.kind)) {
      Type(followers.union(FirstOneDecl));
      OneDecl(followers.union(new SymSet(new int[] {commaSym, semicolonSym})));
      while (sym.kind == commaSym) {
        getSym();
        OneDecl(followers.union(new SymSet(new int[] {commaSym, semicolonSym})));
      }
      accept(semicolonSym, "semicolon expected");
    }
    test(followers, new SymSet(), "DecList : unexpected symbol");
  }

  static void Type(SymSet followers) {
  // Type = "int" | "void" | "bool" | "char" .
    test(FirstType, followers, "invalid start to Type");
    if (FirstType.contains(sym.kind)) getSym();
    test(followers, new SymSet(), "Type : unexpected symbol");
  }

  static void OneDecl(SymSet followers) {
  // OneDecl = "*" OneDecl | Direct .
    test(FirstOneDecl, followers, "invalid start to OneDecl");
    if (FirstOneDecl.contains(sym.kind)) {
      if (sym.kind == pointerSym) {
        getSym(); OneDecl(followers);
      }
      else Direct(followers);
    }
    test(followers, new SymSet(), "OneDecl : unexpected symbol");
  }

  static void Direct(SymSet followers) {
  // Direct = ( ident | "(" OneDecl ")" ) [ Suffix ] .
    test(FirstDirect, followers.union(FirstSuffix), "invalid start to Direct");
    if (FirstDirect.contains(sym.kind))
      switch(sym.kind) {
        case identSym:
          getSym(); break;
        case lParenSym:
          getSym();
          OneDecl(followers.union(new SymSet(new int[] {rParenSym})));
          accept(rParenSym, ") expected"); break;
        default: break; // already handled
      }
    if (FirstSuffix.contains(sym.kind)) Suffix(followers);
    test(followers, new SymSet(), "Direct : unexpected symbol");
  }

  static void Suffix(SymSet followers) {
  // Suffix = Array { Array } | Params .
    test(FirstSuffix, followers, "invalid start to Suffix");
    if (FirstArray.contains(sym.kind)) {
      Array(followers.union(FirstArray));
      while (FirstArray.contains(sym.kind)) Array(followers.union(FirstArray));
    }
    else
      if (FirstParams.contains(sym.kind)) Params(followers);
    test(followers, new SymSet(), "Suffix : unexpected symbol");
  }

  static void Params(SymSet followers) {
  // Params = "(" [ OneParam { "," OneParam } ] ")" .
    test(FirstParams, followers, "invalid start to Params");
    if (FirstParams.contains(sym.kind)) {
      accept(lParenSym, "( expected");
      if (FirstOneParam.contains(sym.kind)) {
        OneParam(followers.union(new SymSet(new int[] {commaSym, rParenSym})));
        while (sym.kind == commaSym) {
          getSym();
          OneParam(followers.union(new SymSet(new int[] {commaSym, rParenSym})));
        }
      }
    }
    accept(rParenSym, ") expected");
    test(followers, new SymSet(), "Params : unexpected symbol");
  }

  static void OneParam(SymSet followers) {
  // OneParam = Type [ OneDecl ] .
    test(FirstOneParam, followers.union(FirstOneDecl), "invalid start to OneParam");
    if (FirstType.contains(sym.kind)) Type(followers.union(FirstOneDecl));
    if (FirstOneDecl.contains(sym.kind)) OneDecl(followers);
    test(followers, new SymSet(), "OneParam : unexpected symbol");
  }

  static void Array(SymSet followers) {
  // Array = "[" [ number ] "]" .
    test(FirstArray, followers, "invalid start to Array");
    if (FirstArray.contains(sym.kind)) {
      accept(lBrackSym, "[ expected");
      if (sym.kind == numSym) getSym();
      accept(rBrackSym, "] expected");
    }
    test(followers, new SymSet(), "Array : unexpected symbol");
  }

When one looks at the FIRST sets one should be struck by the fact that many of them are the same. After a bit of reflection on all this, one should be able to see that we can actually economise on (possibly slow and expensive) calls to the test function, simply by incorporating them only at certain critical points. The complete solution below shows how I suggest this problem might be solved. It is a bit crude in its positioning of error messages.

  static void Cdecls(SymSet followers) {
  // Cdecls = { DecList } EOF .
  // First test allows us to skip any real rubbish at the beginning
    test(followers.union(FirstCdecls), new SymSet(), "bad start to declarations");
    while (FirstDecList.contains(sym.kind))
      DecList(followers.union(FirstDecList).union(new SymSet(new int[] {EOFSym})));
    accept(EOFSym, "EOF expected");
  }

  static void DecList(SymSet followers) {
  // DecList = Type OneDecl { "," OneDecl } ";"  .
    Type(followers.union(FirstOneDecl));
    OneDecl(followers.union(new SymSet(new int[] {commaSym, semicolonSym})));
    while (sym.kind == commaSym) {
      getSym();
      OneDecl(followers.union(new SymSet(new int[] {commaSym, semicolonSym})));
    }
    test(new SymSet(new int[] {semicolonSym}), followers, "semicolon expected");
    if (sym.kind == semicolonSym) getSym();
  }

  static void Type(SymSet followers) {
  // Type = "int" | "void" | "bool" | "char" .
    if (FirstType.contains(sym.kind)) getSym(); else reportError("invalid Type");
  }

  static void OneDecl(SymSet followers) {
  // OneDecl = "*" OneDecl | Direct .
    test(FirstOneDecl, followers, "invalid Declaration");
    if (FirstOneDecl.contains(sym.kind)) {
      if (sym.kind == pointerSym) {
        getSym(); OneDecl(followers);
      }
      else Direct(followers);
    }
    test(followers, new SymSet(), "unexpected symbol");
  }

  static void Direct(SymSet followers) {
  // Direct = ( ident | "(" OneDecl ")" ) [ Suffix ] .
    switch(sym.kind) {
      case identSym:
        getSym(); break;
      case lParenSym:
        getSym();
        OneDecl(followers.union(new SymSet(new int[] {rParenSym})));
        accept(rParenSym, ") expected"); break;
      default:
        reportError("identifier or ( expected"); break;
      }
    if (FirstSuffix.contains(sym.kind)) Suffix(followers);
    test(followers, new SymSet(), "unexpected symbol");
  }

  static void Suffix(SymSet followers) {
  // Suffix = Array { Array } | Params .
    if (FirstArray.contains(sym.kind)) {
      Array(followers.union(FirstArray));
      while (FirstArray.contains(sym.kind)) Array(followers.union(FirstArray));
    }
    else Params(followers);
  }

  static void Params(SymSet followers) {
  // Params = "(" [ OneParam { "," OneParam } ] ")" .
    accept(lParenSym, "( expected");
    if (FirstOneParam.contains(sym.kind)) {
      OneParam(followers.union(new SymSet(new int[] {commaSym, rParenSym})));
      while (sym.kind == commaSym) {
        getSym();
        OneParam(followers.union(new SymSet(new int[] {commaSym, rParenSym})));
      }
    }
    accept(rParenSym, ") expected");
  }

  static void OneParam(SymSet followers) {
  // OneParam = Type [ OneDecl ] .
    Type(followers.union(FirstOneDecl));
    if (FirstOneDecl.contains(sym.kind)) OneDecl(followers);
    test(followers, new SymSet(), "unexpected symbol in parameter declaration");
  }

  static void Array(SymSet followers) {
  // Array = "[" [ number ] "]" .
    accept(lBrackSym, "[ expected");
    if (sym.kind == numSym) getSym();
    accept(rBrackSym, "] expected");
  }

It could be improved still further by setting up a few more static sets, for example:

  static SymSet puncSet0 = new SymSet(new int[] {semicolonSym});
  static SymSet puncSet1 = new SymSet(new int[] {commaSym, semicolonSym});

  static void DecList(SymSet followers) {
  // DecList = Type OneDecl { "," OneDecl } ";"  .
    Type(followers.union(FirstOneDecl));
    OneDecl(followers.union(puncSet1));
    while (sym.kind == commaSym) {
      getSym();
      OneDecl(followers.union(puncSet1));
    }
    test(puncSet0, followers, "semicolon expected");
    if (sym.kind == semicolonSym) getSym();
  }


Home  © P.D. Terry