Computer Science 301 - 2000

Programming Language Translation


Example of a Scanner and Parser

The code below demonstrates a hand written scanner and parser for an example grammar very similar to that discussed on page 131 of the text book. but extended to show some peculiarities of scanner construction needed to distinguish real and integer numbers and adjacent brackets.

Although this is a contrived example, similar problems arise in scanning in Pascal, where input like 3..4 (three tokens - integer number, ellipsis and integer number) and 3.4 (one token - real number) may have to be clearly distinguished. These can be resolved in the same sort of way.

Note that the scanner does not "push back" characters into the input stream, but handles the problem by having a small one symbol buffer instead.

You can get machine readable versions of this code in the file lect25.zip, which you may also find useful as a model for the code needed for completing Practical 24.

C++ version (a Pascal version is given later on)

// Simple Recursive Descent Parser for the language defined by the grammar
// COMPILER A
//   CHARACTERS
//     digit  = "0123456789" .
//     letter = "abcdefgefghijklmnopqrstuvwxyz" .
//   TOKENS
//     number      = digit { digit } [ "." digit { digit } ] .
//     identifier  = letter { letter } .
//   PRODUCTIONS
//     A = B "." .
//     B = identifier | number | "(" C ")" | "(." B ".)" | .
//     C = B D .
//     D = { "+" B } .
//   END A.
//
// Note that there are two forms of numbers
// These cause problems for the scanner, which are handled in an ad hoc
// way, making use of a token buffer to avoid having to backtrack
//
// P.D. Terry,  Rhodes University, 2000

#include "misc.h"
#include "set.h"

bool AtEndOfFile;               // State of character handler
bool haveSym;                   // State of lookahead in scanner

// Declaring ch as a global variable is done for expediency - global variables
// are not always a good thing

char ch;                        // Lookahead character for scanner

// Define token enumeration -- note EOFSym and unknownSym

enum SYMBOLS { unknownSym, EOFSym, identSym, intSym, realSym, lbrackSym,
               rbrackSym, lparenSym, rparenSym, plusSym, periodSym};

// Declaring sym as a global variable is done for expediency - global variables
// are not always a good thing

SYMBOLS sym;                    // Lookahead token for parser
SYMBOLS savedSym;               // saved token in special cases

// The <periodSym> in the next declaration ensures that the sets will all be
// able to contain elements in the range unknownSym .. periodSym

typedef Set<periodSym> SymSet;  // Set type for FIRST and FOLLOW sets

// First sets for later error recovery additions

SymSet FirstA(identSym, intSym, realSym, lparenSym, lbrackSym);
SymSet FirstB(identSym, intSym, realSym, lparenSym, lbrackSym);
SymSet FirstC(identSym, intSym, realSym, lparenSym, lbrackSym);
SymSet FirstD(plusSym);

void Abort (char *S) {
// Abandon parsing with Error message S
  printf("%s\n", S);
  fprintf(stderr, "Errors detected\n");
  exit(EXIT_FAILURE);
}

// +++++++++++++++++++++++++ Character Handler +++++++++++++++++++++++++

void GetChar (void) {
// Obtains next character ch from inFile, or CHR(0) if EOF reached
  if (AtEndOfFile) ch = '\0';
  else {
    ch = getchar();
    if (ch != EOF) putchar(ch); // copy to output file for posterity
    else { AtEndOfFile = true; ch = '\0'; }
  }
}

// +++++++++++++++++++++++++ Scanner +++++++++++++++++++++++++++++++++++

void GetSym (void) {
// Scans for next sym from input source
  if (haveSym)            // we have one buffered if the last token was numeric
    { sym = savedSym; haveSym = false; return; }
  while (ch > 0 && ch <= ' ') GetChar(); // skip over whitespace
  if (isalpha(ch))       // identifiers
    { while (isalpha(ch)) GetChar(); sym = identSym; return; }
  if (isdigit(ch))       // numbers - we may have to look further ahead
    { while (isdigit(ch)) GetChar();
      if (ch != '.') { sym = intSym; return; }
      else               // real number or integer followed by something else
      { GetChar();
        if (isdigit(ch)) // it is a real number
          { while (isdigit(ch)) GetChar(); sym = realSym; return; }
        if (ch == ')' )  // it is an integer followed by .)
          { sym = intSym; savedSym = rbrackSym; haveSym = true; GetChar(); return; }
        else             // it is an integer followed by period
          { sym = intSym; savedSym = periodSym; haveSym = true; return; }
      }
    }
  switch (ch) {          // all other cases can be handled here
    case '\0' : sym = EOFSym; break;  // no need to GetChar here, of course
    case '('  : GetChar();
                if (ch == '.') { sym = lbrackSym; GetChar(); } else sym = lparenSym;
                break;
    case '.'  : GetChar();
                if (ch == ')') { sym = rbrackSym; GetChar(); } else sym = periodSym;
                break;
    case '+'  : sym = plusSym; GetChar(); break;
    case ')'  : sym = rparenSym; GetChar(); break;
    default   : sym = unknownSym; GetChar(); break; // mop up all other funnies
  }
}

void Accept (SYMBOLS WantedSym, char *S) {
// Check that lookahead token is WantedSym
// Note that the parameter is of SYMBOLS type - not char or int
  if (sym == WantedSym) GetSym(); else Abort(S);
}

// +++++++++++++++++++++++++ Parser ++++++++++++++++++++++++++++++++++++
// use prototypes to handle recursion easily

void A(void);
void B(void);
void C(void);
void D(void);

void A(void)
//  A = B "." .
{ B(); Accept(periodSym, " Error - '.' expected"); }

void B(void)
//  B  =  identifier | integer | real | "(" C ")"   |   "(." B ".)"   |   .
{ switch (sym)
  { case identSym :
    case intSym :
    case realSym :
      GetSym(); break;
    case lparenSym :
      GetSym(); C(); Accept(rparenSym, " Error - ')' expected"); break;
    case lbrackSym :
      GetSym(); B(); Accept(rbrackSym, " Error - '.)' expected"); break;
    case rparenSym :
    case rbrackSym :
    case plusSym :
    case periodSym :
      break;  // no action for followers of B
    default:
      printf("Unknown symbol %d\n", sym); exit(EXIT_FAILURE);
  }
}

void C(void)
//  C = B D .
{ B(); D(); }

void D(void)
//  D  =  { "+" B } .
{ while (sym == plusSym) { GetSym(); B(); } }

void main()
{
// Initialise character handler
  AtEndOfFile = false; GetChar();
// Initialise scanner
   haveSym = false; GetSym();
// Start from goal symbol parser
  A();
// If we get back here everything must have been okay
  fprintf(stderr, "Parsed correctly");
}
</PRE>

<HR Width="100%">

<H4>Turbo Pascal equivalent</H4>

<PRE>
PROGRAM Parser1;
(* Simple Recursive Descent Parser for the language defined by the grammar
   COMPILER A
     CHARACTERS
       digit  = "0123456789" .
       letter = "abcdefgefghijklmnopqrstuvwxyz" .
     TOKENS
       number      = digit { digit } [ "." digit { digit } ] .
       identifier  = letter { letter } .
     PRODUCTIONS
       A = B "." .
       B = identifier | number | "(" C ")" | "(." B ".)" | .
       C = B D .
       D = { "+" B } .
     END A.

   Note that there are two forms of numbers. These cause problems for the
   scanner, which are handled in an ad hoc way, making use of a token buffer
   to avoid having to backtrack
   P.D. Terry,  Rhodes University, 2000 *)

(* Define token enumeration -- note EOFSym and unknownSym *)

  TYPE
   SYMBOLS = ( unknownSym, EOFSym, identSym, intSym, realSym, lbrackSym,
               rbrackSym, lparenSym, rparenSym, plusSym, periodSym );
  VAR

(* Declaring CH and Sym as global variables is done for expediency - global
   variables are not always a good thing *)

    CH : CHAR;                 (* Lookahead character for scanner *)
    Sym : SYMBOLS;             (* Lookahead token for parser *)
    savedSym : SYMBOLS;        (* Saved token in special cases *)
    AtEndOfFile : BOOLEAN;     (* State of character handler *)
    haveSym : BOOLEAN;         (* State of lookahead in scanner *)

    FirstA, FirstB, FirstC, FirstD : SET OF SYMBOLS;

  PROCEDURE Abort (S : STRING);
  (* Abandon parsing with Error message S *)
    BEGIN
      WriteLn(S); WriteLn('Errors detected '); HALT
    END;

  (* +++++++++++++++++++++++++ Character Handler +++++++++++++++++++++++++ *)

  PROCEDURE GetChar;
  (* Obtains next character CH from InFile, or CHR(0) if EOF reached *)
    BEGIN
      IF AtEndOfFile
        THEN CH := CHR(0)
        ELSE BEGIN Read(CH); Write(CH); AtEndOfFile := EOF END
    END;

  (* +++++++++++++++++++++++++ Scanner +++++++++++++++++++++++++++++++++++ *)

  PROCEDURE GetSym;
  (* Scans for next Sym from standard input *)
    BEGIN
      (* We may have one already if the last token was numeric *)
      IF haveSym
        THEN BEGIN Sym := savedSym; haveSym := FALSE END
        ELSE BEGIN
          WHILE CH IN [CHR(1) .. CHR(32)] DO GetChar; (* skip over white space *)
            CASE CH OF
              CHR(0) :
                 BEGIN Sym := EOFSym END;
              '(' :
                 BEGIN GetChar;
                   IF CH = '.'
                     THEN BEGIN Sym := lbrackSym; GetChar END
                     ELSE Sym := lparenSym
                 END;
              '.' :
                 BEGIN GetChar;
                   IF CH = ')'
                     THEN BEGIN Sym := rbrackSym; GetChar END
                     ELSE Sym := periodSym
                 END;
              '+' :
                 BEGIN Sym := plusSym; GetChar END;
              ')' :
                 BEGIN Sym := rparenSym; GetChar END;
              'A' .. 'Z', 'a' .. 'z' :
                 BEGIN
                   WHILE CH IN ['A' .. 'Z', 'a' .. 'z'] DO GetChar; Sym := identSym;
                 END;
              '0' .. '9' :
                 BEGIN
                   WHILE CH IN ['0' .. '9'] DO GetChar;
                   IF CH <> '.'
                     THEN Sym := intSym
                     ELSE BEGIN (* real number or integer followed by something *)
                       GetChar;
                       IF CH IN ['0' .. '9']
                         THEN (* it is a real number *) BEGIN
                           WHILE CH IN ['0' .. '9'] DO GetChar; Sym := realSym;
                         END
                         ELSE IF CH = ')' THEN BEGIN (* it is an integer followed by .) *)
                           Sym := intSym; savedSym := rbrackSym;
                           haveSym := TRUE; GetChar;
                         END
                         ELSE BEGIN (* it is an integer followed by a period *)
                           Sym := intSym; savedSym := periodSym;
                           haveSym := TRUE;
                         END
                     END
                 END;
              ELSE (* mop up all other funnies *)
                BEGIN Sym := unKnownSym; GetChar END;
            END
      END
    END;

  PROCEDURE Accept (WantedSym : SYMBOLS; S : STRING);
  (* Check that lookahead token is WantedSym *)
    BEGIN
      IF Sym = WantedSym
        THEN GetSym
        ELSE Abort(S)
    END;

  (* +++++++++++++++++++++++++ Parser +++++++++++++++++++++++++++++++++++++
     - declare all of these "forward" so as to handle recursion easily *)

  PROCEDURE A; FORWARD;
  PROCEDURE B; FORWARD;
  PROCEDURE C; FORWARD;
  PROCEDURE D; FORWARD;

  PROCEDURE A;
  (*  A = B "." . *)
    BEGIN
      B; Accept(periodSym, ' Error - "." expected')
    END;

  PROCEDURE B;
  (*  B  =  identifier | integer | real | "(" C ")" | "(." B ".)" |  . *)
    BEGIN
      CASE Sym OF
        identSym, intSym, realSym :
          GetSYM;
        lparenSym :
          BEGIN GetSYM; C; Accept(rparenSym, ' Error - ")" expected') END;
        lbrackSym :
          BEGIN GetSYM; B; Accept(rbrackSym, ' Error - ".)" expected') END;
        rparenSym, rbrackSym, plusSym, periodSym :
          (* no action for followers of B *);
        ELSE BEGIN WriteLn('Unknown symbol'); HALT END
      END
    END;

  PROCEDURE C;
  (*  C = B D . *)
    BEGIN
      B; D
    END;

  PROCEDURE D;
  (*  D  =  { "+" B } . *)
    BEGIN
      WHILE Sym = plusSym DO BEGIN GetSYM; B END
    END;

  BEGIN (* Parser1 *)
  (* Initialise FIRST sets - used in later error recovery *)
    FirstA := [identSym, intSym, realSym, lparenSym, lbrackSym];
    FirstB := [identSym, intSym, realSym, lparenSym, lbrackSym];
    FirstC := [identSym, intSym, realSym, lparenSym, lbrackSym];
    FirstD := [plusSym];
  (* Initialise character handler *)
    AtEndOfFile := FALSE; GetChar;
  (* Initialise scanner *)
    haveSym := FALSE; GetSym;
  (* Start from goal symbol parser *)
    A;
  (* If we get back here everything must have been okay *)
    WriteLn('Successful');
  END.