The code below demonstrates a hand written scanner and parser for an example grammar very similar to that discussed on page 131 of the text book. but extended to show some peculiarities of scanner construction needed to distinguish real and integer numbers and adjacent brackets.
Although this is a contrived example, similar problems arise in scanning in Pascal, where input like 3..4 (three tokens - integer number, ellipsis and integer number) and 3.4 (one token - real number) may have to be clearly distinguished. These can be resolved in the same sort of way.
Note that the scanner does not "push back" characters into the input stream, but handles the problem by having a small one symbol buffer instead.
You can get machine readable versions of this code in the file lect25.zip, which you may also find useful as a model for the code needed for completing Practical 24.
// Simple Recursive Descent Parser for the language defined by the grammar // COMPILER A // CHARACTERS // digit = "0123456789" . // letter = "abcdefgefghijklmnopqrstuvwxyz" . // TOKENS // number = digit { digit } [ "." digit { digit } ] . // identifier = letter { letter } . // PRODUCTIONS // A = B "." . // B = identifier | number | "(" C ")" | "(." B ".)" | . // C = B D . // D = { "+" B } . // END A. // // Note that there are two forms of numbers // These cause problems for the scanner, which are handled in an ad hoc // way, making use of a token buffer to avoid having to backtrack // // P.D. Terry, Rhodes University, 2000 #include "misc.h" #include "set.h" bool AtEndOfFile; // State of character handler bool haveSym; // State of lookahead in scanner // Declaring ch as a global variable is done for expediency - global variables // are not always a good thing char ch; // Lookahead character for scanner // Define token enumeration -- note EOFSym and unknownSym enum SYMBOLS { unknownSym, EOFSym, identSym, intSym, realSym, lbrackSym, rbrackSym, lparenSym, rparenSym, plusSym, periodSym}; // Declaring sym as a global variable is done for expediency - global variables // are not always a good thing SYMBOLS sym; // Lookahead token for parser SYMBOLS savedSym; // saved token in special cases // The <periodSym> in the next declaration ensures that the sets will all be // able to contain elements in the range unknownSym .. periodSym typedef Set<periodSym> SymSet; // Set type for FIRST and FOLLOW sets // First sets for later error recovery additions SymSet FirstA(identSym, intSym, realSym, lparenSym, lbrackSym); SymSet FirstB(identSym, intSym, realSym, lparenSym, lbrackSym); SymSet FirstC(identSym, intSym, realSym, lparenSym, lbrackSym); SymSet FirstD(plusSym); void Abort (char *S) { // Abandon parsing with Error message S printf("%s\n", S); fprintf(stderr, "Errors detected\n"); exit(EXIT_FAILURE); } // +++++++++++++++++++++++++ Character Handler +++++++++++++++++++++++++ void GetChar (void) { // Obtains next character ch from inFile, or CHR(0) if EOF reached if (AtEndOfFile) ch = '\0'; else { ch = getchar(); if (ch != EOF) putchar(ch); // copy to output file for posterity else { AtEndOfFile = true; ch = '\0'; } } } // +++++++++++++++++++++++++ Scanner +++++++++++++++++++++++++++++++++++ void GetSym (void) { // Scans for next sym from input source if (haveSym) // we have one buffered if the last token was numeric { sym = savedSym; haveSym = false; return; } while (ch > 0 && ch <= ' ') GetChar(); // skip over whitespace if (isalpha(ch)) // identifiers { while (isalpha(ch)) GetChar(); sym = identSym; return; } if (isdigit(ch)) // numbers - we may have to look further ahead { while (isdigit(ch)) GetChar(); if (ch != '.') { sym = intSym; return; } else // real number or integer followed by something else { GetChar(); if (isdigit(ch)) // it is a real number { while (isdigit(ch)) GetChar(); sym = realSym; return; } if (ch == ')' ) // it is an integer followed by .) { sym = intSym; savedSym = rbrackSym; haveSym = true; GetChar(); return; } else // it is an integer followed by period { sym = intSym; savedSym = periodSym; haveSym = true; return; } } } switch (ch) { // all other cases can be handled here case '\0' : sym = EOFSym; break; // no need to GetChar here, of course case '(' : GetChar(); if (ch == '.') { sym = lbrackSym; GetChar(); } else sym = lparenSym; break; case '.' : GetChar(); if (ch == ')') { sym = rbrackSym; GetChar(); } else sym = periodSym; break; case '+' : sym = plusSym; GetChar(); break; case ')' : sym = rparenSym; GetChar(); break; default : sym = unknownSym; GetChar(); break; // mop up all other funnies } } void Accept (SYMBOLS WantedSym, char *S) { // Check that lookahead token is WantedSym // Note that the parameter is of SYMBOLS type - not char or int if (sym == WantedSym) GetSym(); else Abort(S); } // +++++++++++++++++++++++++ Parser ++++++++++++++++++++++++++++++++++++ // use prototypes to handle recursion easily void A(void); void B(void); void C(void); void D(void); void A(void) // A = B "." . { B(); Accept(periodSym, " Error - '.' expected"); } void B(void) // B = identifier | integer | real | "(" C ")" | "(." B ".)" | . { switch (sym) { case identSym : case intSym : case realSym : GetSym(); break; case lparenSym : GetSym(); C(); Accept(rparenSym, " Error - ')' expected"); break; case lbrackSym : GetSym(); B(); Accept(rbrackSym, " Error - '.)' expected"); break; case rparenSym : case rbrackSym : case plusSym : case periodSym : break; // no action for followers of B default: printf("Unknown symbol %d\n", sym); exit(EXIT_FAILURE); } } void C(void) // C = B D . { B(); D(); } void D(void) // D = { "+" B } . { while (sym == plusSym) { GetSym(); B(); } } void main() { // Initialise character handler AtEndOfFile = false; GetChar(); // Initialise scanner haveSym = false; GetSym(); // Start from goal symbol parser A(); // If we get back here everything must have been okay fprintf(stderr, "Parsed correctly"); } </PRE> <HR Width="100%"> <H4>Turbo Pascal equivalent</H4> <PRE> PROGRAM Parser1; (* Simple Recursive Descent Parser for the language defined by the grammar COMPILER A CHARACTERS digit = "0123456789" . letter = "abcdefgefghijklmnopqrstuvwxyz" . TOKENS number = digit { digit } [ "." digit { digit } ] . identifier = letter { letter } . PRODUCTIONS A = B "." . B = identifier | number | "(" C ")" | "(." B ".)" | . C = B D . D = { "+" B } . END A. Note that there are two forms of numbers. These cause problems for the scanner, which are handled in an ad hoc way, making use of a token buffer to avoid having to backtrack P.D. Terry, Rhodes University, 2000 *) (* Define token enumeration -- note EOFSym and unknownSym *) TYPE SYMBOLS = ( unknownSym, EOFSym, identSym, intSym, realSym, lbrackSym, rbrackSym, lparenSym, rparenSym, plusSym, periodSym ); VAR (* Declaring CH and Sym as global variables is done for expediency - global variables are not always a good thing *) CH : CHAR; (* Lookahead character for scanner *) Sym : SYMBOLS; (* Lookahead token for parser *) savedSym : SYMBOLS; (* Saved token in special cases *) AtEndOfFile : BOOLEAN; (* State of character handler *) haveSym : BOOLEAN; (* State of lookahead in scanner *) FirstA, FirstB, FirstC, FirstD : SET OF SYMBOLS; PROCEDURE Abort (S : STRING); (* Abandon parsing with Error message S *) BEGIN WriteLn(S); WriteLn('Errors detected '); HALT END; (* +++++++++++++++++++++++++ Character Handler +++++++++++++++++++++++++ *) PROCEDURE GetChar; (* Obtains next character CH from InFile, or CHR(0) if EOF reached *) BEGIN IF AtEndOfFile THEN CH := CHR(0) ELSE BEGIN Read(CH); Write(CH); AtEndOfFile := EOF END END; (* +++++++++++++++++++++++++ Scanner +++++++++++++++++++++++++++++++++++ *) PROCEDURE GetSym; (* Scans for next Sym from standard input *) BEGIN (* We may have one already if the last token was numeric *) IF haveSym THEN BEGIN Sym := savedSym; haveSym := FALSE END ELSE BEGIN WHILE CH IN [CHR(1) .. CHR(32)] DO GetChar; (* skip over white space *) CASE CH OF CHR(0) : BEGIN Sym := EOFSym END; '(' : BEGIN GetChar; IF CH = '.' THEN BEGIN Sym := lbrackSym; GetChar END ELSE Sym := lparenSym END; '.' : BEGIN GetChar; IF CH = ')' THEN BEGIN Sym := rbrackSym; GetChar END ELSE Sym := periodSym END; '+' : BEGIN Sym := plusSym; GetChar END; ')' : BEGIN Sym := rparenSym; GetChar END; 'A' .. 'Z', 'a' .. 'z' : BEGIN WHILE CH IN ['A' .. 'Z', 'a' .. 'z'] DO GetChar; Sym := identSym; END; '0' .. '9' : BEGIN WHILE CH IN ['0' .. '9'] DO GetChar; IF CH <> '.' THEN Sym := intSym ELSE BEGIN (* real number or integer followed by something *) GetChar; IF CH IN ['0' .. '9'] THEN (* it is a real number *) BEGIN WHILE CH IN ['0' .. '9'] DO GetChar; Sym := realSym; END ELSE IF CH = ')' THEN BEGIN (* it is an integer followed by .) *) Sym := intSym; savedSym := rbrackSym; haveSym := TRUE; GetChar; END ELSE BEGIN (* it is an integer followed by a period *) Sym := intSym; savedSym := periodSym; haveSym := TRUE; END END END; ELSE (* mop up all other funnies *) BEGIN Sym := unKnownSym; GetChar END; END END END; PROCEDURE Accept (WantedSym : SYMBOLS; S : STRING); (* Check that lookahead token is WantedSym *) BEGIN IF Sym = WantedSym THEN GetSym ELSE Abort(S) END; (* +++++++++++++++++++++++++ Parser +++++++++++++++++++++++++++++++++++++ - declare all of these "forward" so as to handle recursion easily *) PROCEDURE A; FORWARD; PROCEDURE B; FORWARD; PROCEDURE C; FORWARD; PROCEDURE D; FORWARD; PROCEDURE A; (* A = B "." . *) BEGIN B; Accept(periodSym, ' Error - "." expected') END; PROCEDURE B; (* B = identifier | integer | real | "(" C ")" | "(." B ".)" | . *) BEGIN CASE Sym OF identSym, intSym, realSym : GetSYM; lparenSym : BEGIN GetSYM; C; Accept(rparenSym, ' Error - ")" expected') END; lbrackSym : BEGIN GetSYM; B; Accept(rbrackSym, ' Error - ".)" expected') END; rparenSym, rbrackSym, plusSym, periodSym : (* no action for followers of B *); ELSE BEGIN WriteLn('Unknown symbol'); HALT END END END; PROCEDURE C; (* C = B D . *) BEGIN B; D END; PROCEDURE D; (* D = { "+" B } . *) BEGIN WHILE Sym = plusSym DO BEGIN GetSYM; B END END; BEGIN (* Parser1 *) (* Initialise FIRST sets - used in later error recovery *) FirstA := [identSym, intSym, realSym, lparenSym, lbrackSym]; FirstB := [identSym, intSym, realSym, lparenSym, lbrackSym]; FirstC := [identSym, intSym, realSym, lparenSym, lbrackSym]; FirstD := [plusSym]; (* Initialise character handler *) AtEndOfFile := FALSE; GetChar; (* Initialise scanner *) haveSym := FALSE; GetSym; (* Start from goal symbol parser *) A; (* If we get back here everything must have been okay *) WriteLn('Successful'); END.