Computer Science 301 - 2001

Programming Language Translation

Practical for Week 15, beginning 30 July 2001 - Solutions

These problems were all designed to heighten your understanding of simple file and table handling in C++. Some of them were rather well done, but a great many submissions showed a great tendency towards "hacked" code - far from elegant, full of mistakes, and quite obviously written without first analysing the problem, or designing a solution carefully. If you look at the suggested solutions below you will surely agree that they are nearly all very short and simple; none of the first five should have taken more than 15 minutes to develop. You can find all this source code in the file PRAC15A.ZIP if you want to study it further.

Task 1 simply required a program to copy standard input to standard output. This, at least, caused no great problem.

  #include "misc.h"

  void main (void) {
  // copy stdin to stdout
  // Normal usage:  TASK1 <infile >outfile
  // P.D. Terry, Rhodes University, 2001
    char ch = getchar();
    while (ch != EOF) {
      putchar(ch);
      ch = getchar();
    }
  }

The remaining problems all called for programs that could take their input either from standard input (sending output to standard output), or from a file specified as a command line parameter. However, many submissions offered no choice. In spite of devoting a tutorial to file handling, some people did not bother to check that the file opening process would "work". Finally, the most awful hacks were submitted by people who simply cut and pasted large sections of their code to give two slightly different versions of the same thing. That, frankly, is almost unforgivable! By this stage in your development you should not be tempted to commit such crimes.

In my solutions, misc.h has been renamed to misc1.h and extended to have the following extra routines useful in all the solutions thereafter (The prac sheet told you to do this, but sadly, nobody read the prac sheet, it appears!)

   void openfiles (FILE *&infile, FILE *&outfile, char *prompt,
                   int argc, char *inName, char *ext, char *outName) {
   // check correct command line usage - very wise to do this
     if (argc > 2) {
       fprintf(stderr, "Usage: %s infile\n", prompt); exit(EXIT_FAILURE);
     }
     if (argc == 1) {
       infile = stdin; outfile = stdout; outName = "stdout"; return;
     }
   // attempt to open input file and abort program if we cannot do so
     infile = fopen(inName, "r");
     if (infile == NULL) {
       fprintf(stderr, "could not open %s\n", inName); exit(EXIT_FAILURE);
     }
   // attempt to open output file - first derive name
     appendextension(inName, ext, outName);
     outfile = fopen(outName, "w");
     if (outfile == NULL) {
       fprintf(stderr, "could not open %s\n", outName); exit(EXIT_FAILURE);
     }
   }

   void closefiles (FILE *infile, FILE *outfile, char *inName, char *outName) {
   // close output file and check that this happened properly
     if (outfile != stdout) {
       if (fclose(outfile) == EOF) {
         fprintf(stderr, "could not close %s\n", outName); exit(EXIT_FAILURE);
       }
       fprintf(stderr, "File %s copied to %s\n", inName, outName);
     }
   }

With the routines above available, Task 2 became a simple variation on Task 1:

  #include "misc1.h"

  void main (int argc, char *argv[]) {
  // copy input file to input.NEW
  // Usage: TASK2 input   or  TASK2 <infile >outfile
  // P.D. Terry, Rhodes University, 2001
    char outName[256];
    FILE *infile, *outfile;

    openfiles(infile, outfile, "TASK2", argc, argv[1], "NEW", outName);

  // copy file character by character
    char ch = getc(infile);
    while (ch != EOF) {
      putc(ch, outfile);
      ch = getc(infile);
    }

    closefiles(infile, outfile, argv[1], outName);
  }

Task 3 required that the program be modified to determine the length of the file copied, and report this (on stderr, where it will be easily seen, but will not mess up the output file proper) . To solve this properly requires that you be aware that on WinTel machines, textfiles are stored with two bytes (CR + LF) marking the ends of lines. So to do a byte count requires that you take this into account. Unix systems have only one byte to mark line ends, so it makes sense to try to write portable source code by making use of a #if preprocessor directive. The rest of the code is a trivial extension to that used in Task 2:

  #include "misc1.h"

  void main (int argc, char *argv[]) {
  // copy input file to input.NEW and count bytes
  // Usage: TASK3 input   or  TASK3 <infile >outfile
  // P.D. Terry, Rhodes University, 2001
    int bytes = 0;
    char outName[256];
    FILE *infile, *outfile;

    openfiles(infile, outfile, "TASK3", argc, argv[1], "NEW", outName);

  // copy file character by character
    char ch = getc(infile);
    while (ch != EOF) {
      bytes++;
  #if __MSDOS__ || MSDOS || WIN32 || __WIN32__
      if (ch == '\n') bytes++; // handle CR-LF in DOS format
  #endif
      putc(ch, outfile);
      ch = getc(infile);
    }

    closefiles(infile, outfile, argv[1], outName);
    fprintf(stderr, "%d bytes copied\n", bytes);
  }

Task 4 asked that you write a program to copy a file line by line, rather than character by character. This is easy to do unless you fall into the silly trap of not allocating storage for the string needed for the current line. In spite of many exhortations to the contrary, some poor souls wrote code like

     char *str;

instead of

     const int MAXLINE = 1000;
     char str[MAXLINE];

or, perhaps

     const int MAXLINE = 1000;
     char *str = new char[MAXLINE];

If you insist on not declaring/allocating storage for your strings you will end up in a hell of a mess sooner or later.

  #include "misc1.h"

  void main (int argc, char *argv[]) {
  // copy input file to input.NEW
  // Usage: TASK4 input   or  TASK4 <infile >outfile
  // P.D. Terry, Rhodes University, 2001
    const int MAXLINE = 1000;
    char str[MAXLINE];
    char outName[256];
    FILE *infile, *outfile;

    openfiles(infile, outfile, "TASK4", argc, argv[1], "NEW", outName);

  // copy file line by line
    while (fgets(str, MAXLINE, infile) != NULL) {
      fputs(str, outfile);
    }

    closefiles(infile, outfile, argv[1], outName);

  }

Task 5 required that you write a program that read a file line by line and then made a copy in which the tabs had beeen expanded to spaces. This can be done in various ways, but a point many missed was that the number of spaces required depends on how many characters have been written to an output line already, and not on how many characters have been scanned from an input line:

  #include "misc1.h"

  void main (int argc, char *argv[]) {
  // copy input file to input.NEW, expanding tabs
  // Usage: TASK5 input   or  TASK5 <infile >outfile
  // P.D. Terry, Rhodes University, 2001
    const int MAXLINE = 1000; // hopefully long enough for most text files
    const int TABWIDTH = 8;   // defined this way for ease of modification
    char str[MAXLINE];
    char outName[256];
    FILE *infile, *outfile;

    openfiles(infile, outfile, "TASK5", argc, argv[1], "NEW", outName);

  // copy file line by line, expanding tabs
    while (fgets(str, MAXLINE, infile) != NULL) {

  // we compute the length of the line each time, but only once per line
      int l = strlen(str);

  // we have to keep track of how many characters have been written to the output
      int charsWritten = 0;

  // now we scan through the input line, character by character
      for (int i = 0; i < l; i++) {
        if (str[i] == '\t') {
          int spaces = TABWIDTH - charsWritten % TABWIDTH;
          fprintf(outfile, "%*c", spaces, ' ');
          charsWritten += spaces;
  //
  //  Here is an alternative way of doing it, avoiding the clever printf call
  //        while (spaces > 0) {
  //          putc(' ', outfile); spaces--; charsWritten++;
  //        }
        }
        else {
          putc(str[i], outfile);
          charsWritten++;
        }
      }
    }

    closefiles(infile, outfile, argv[1], outName);

  }

Task 6 required that comments be stripped from a C++ source file. This is really rather easy to do if you just think it out carefully. We need to keep a two-character "window" onto the input file, so as to be able to handle nasty cases like

        /* this comment has a * in the middle or even ** or *** before the end */
        /* this comment has a / in the middle *//* and another immediately */

Note carefully how the solution has been decomposed. The main function remains very simple - just like those in Tasks 2 and 3 - and devolves the main logic to an auxiliary routine.

  #include "misc1.h"

  void checkcomment (FILE * infile, FILE *outfile, char ch) {
  // precondition:  ch = '/'
  // postcondition: if ch is not the start of a comment
  //                  then it and its successor have been written to outfile
  //                  else comment has been skipped
    char nextch = getc(infile);
    if (nextch == EOF) {             // disaster strikes
      putc(ch, outfile); return;     // copy the single valid one and get out
    }
    if (nextch == '/') {             // we must handle // comments
      do ch = getc(infile); while (ch != '\n' && !feof(infile));
      if (!feof(infile)) putc('\n', outfile);
    }
    else if (nextch == '*') {        // we must handle /* comments */
      ch = getc(infile); nextch = getc(infile);
      while (!(ch == '*' && nextch == '/') & !feof(infile)) {
        ch = nextch; nextch = getc(infile);
      }
    }
    else {                           // no comment - copy both characters read
      putc(ch, outfile); putc(nextch, outfile);
    }
  }

  void main (int argc, char *argv[]) {
  // copy input C++ file to form input.CCC, deleting comments
  // Usage: TASK6 input   or  TASK6 <infile >outfile
  // P.D. Terry, Rhodes University, 2001
    char outName[256];
    FILE *infile, *outfile;

    openfiles(infile, outfile, "TASK6", argc, argv[1], "CCC", outName);

  // copy file character by character
    char ch = getc(infile);
    while (ch != EOF) {
      if (ch == '/') checkcomment(infile, outfile, ch); else putc(ch, outfile);
      ch = getc(infile);
    }

    closefiles(infile, outfile, argv[1], outName);

  }

This solution is still quite naive, and would not work for all programs. Can you think of a source program that it would not handle correctly? If you cannot, then come to discuss it with me.

Task 7 required a program that would read in a Clang source program and beautify it by converting the keywords into UPPER CASE throughout, and spelling all other words in the manner in which they had first been introduced.

This clearly required a lot more thought than most people appeared to give it. I'm afraid I have little sympathy for people who claim to have spent ten hours on it and still got nowhere - they should think for one hour and save themselves seven - please resist the temptation simply to rush in and hack!

The underlying idea is simply to build a list of words, suitably initialised with the keywords in UPPER CASE, and add to it as needed. But care must be taken to use the string handling primitives properly, and to allocate space for the strings.

Note also the level of commentary in the code that follows - many of you submit programs with next to no commentary, which is unacceptable.

  #include "misc1.h"

  const int MAX = 500;        // Limit on size of table
  static int Size = 18;       // Dynamic size of table (entries actually used )
  char Table[500][20] = {     // First 18 entries are reserved words
    "BEGIN",     "COBEGIN",  "COEND",      "CONST",    "DO",    "END",
    "FUNCTION",  "IF",       "PROCEDURE",  "PROGRAM",  "READ",  "RETURN",
    "SIGNAL",    "THEN",     "VAR",        "WAIT",     "WHILE", "WRITE"
  };

  static void updateTable (char *Name, bool &success) {
  // Search for Name among the entries in Table by performing a case-insensitive
  // search.  If the Name cannot be found, attempt to add it to the Table.
  // If it is already present, replace Name with the spelling found in Table.
  // success is returned as an indication of whether the action succeeded
    if (Size >= MAX) {                    // Take care not to exceed the Table
      success = false; return;
    }
    strcpy(Table[Size], Name);            // copy the name properly as a sentinel.
    int i = 0;                            // start at the bottom
    while (stricmp(Table[i], Name) != 0)
      i++;                                // - and we must find it somewhere!
    if (i == Size) {                      // if it was not in the original table
      Size++;                             // attempt to add it
      success = Size < MAX;
    }
    else strcpy(Name, Table[i]);          // else it was there already, so
                                          // retrieve the extant spelling
  }

  // It is always a very good idea to give names to enumerations, rather than
  // simply to use "magic numbers"

  typedef enum { noToken, wordToken, otherToken } tokens;

  tokens fgetnextstr (char *str, FILE *stream) {
  // Reads next string from stream into str.
  // Strings are of two kinds; the return value of the function
  //         distinguishes them from one another:
  // Returns wordToken if the string is a valid identifier or keyword (consists
  //         of an initial letter followed by other letters and digits only)
  // Returns otherToken if the string does not start with a letter,
  //         and contains no letters
  // Returns noToken if the stream is exhausted
  //         (no further string could be extracted)
    char ch = fgetc(stream);              // first character of token
    if (ch == EOF)                        // check that there really is one to find
      return(noToken);
  // The code below shows an array approach to manipulating the string
    int i = 0;                            // array index
    if (isalpha(ch)) {                    // then we must have a keyword or identifier
      do {                                // repeat
        str[i++] = ch;                    //   copy this character of the word to the buffer
        ch = fgetc(stream);               //   get the next character
      }
      while (ch != EOF && isalnum(ch));   // until we get past the end of the word
      str[i] = '\0';                      // add trailing nul character as usual
      ungetc(ch, stream);                 // the last character read is the first one of
                                          // the next token, so poke it back
      return wordToken;                   // return indication of word
    }
    else if (ch == '\'') {                // we must have a string
      do {                                // repeat
        str[i++] = ch;                    //   copy this character to the buffer
        ch = fgetc(stream);               //   get the next character
      } while (ch != EOF && ch != '\'');  // until we get to the end of the string
      str[i] = '\'';                      // add trailing quote character
      str[++i] = '\0';                    // add trailing nul character as usual
      return otherToken;                  // return indication of non word
    }
    else if (ch == '(') {                 // we might have a comment
      str[i++] = '(';                     // store the ( character
      ch = fgetc(stream);                 // get the next character
      if (ch != '*') {                    // we do not have a comment after all
        str[1] = '\0';                    // add trailing null charactr
        ungetc(ch, stream);               // the last character read is the first one of
                                          // the next token, so poke it back
        return otherToken;                // return indication of non-word
      }
      str[i++] = ch;                      // we do have a comment - so far stored (*
      ch = fgetc(stream);                 // now store the text of the comment
      do {                                // repeat
        str[i++] = ch;                    //   copy this character to the buffer
        ch = fgetc(stream);               //   get the next character
      } while (ch != EOF && str[i-1] != '*' && ch != ')');
                                          // until we get to the end of the comment
      str[i++] = ch;                      // copy the ) character to the buffer
      str[i] = '\0';                      // add trailing nul character as usual
      return otherToken;                  // return indication of non word
    }
    else {                                // we must have a boring token
      do {                                // repeat
        str[i++] = ch;                    //   copy this character of the non-word to buffer
        ch = fgetc(stream);               //   get the next character
      } while (ch != EOF && !isalpha(ch) && ch != '\'' && ch != '(');
                                          // until we get to the start of the next token
      str[i] = '\0';                      // add trailing nul character as usual
      ungetc(ch, stream);                 // the last character read is the first one of
                                          // the next token, so poke it back
      return otherToken;                  // return indication of non-word
    }
  }

  void main (int argc, char *argv[]) {
  // Beautify Clang program by forcing keywords into upper case, and ensuring all
  // other identifiers appear as they were first introduced
  // Usage: TASK7 input   or  TASK7 <infile >outfile
  // P.D. Terry, Rhodes University, 2001
    FILE *infile, *outfile;
    char outName[256], str[2000];
    bool okay;
    tokens nextToken;

    openfiles(infile, outfile, "TASK7", argc, argv[1], "NEW", outName);

  // Read the Clang file token by token
    nextToken = fgetnextstr(str, infile);
    while (nextToken != noToken) {
      switch (nextToken) {
        case wordToken:
          updateTable(str, okay); fprintf(outfile, "%s", str);
          if (!okay) {
            fprintf(stderr, "compiler error\n"); exit(EXIT_FAILURE);
          }
          break;
        case otherToken:
          fprintf(outfile, "%s", str); break;
        default:
          fprintf(stderr, "compiler error\n"); exit(EXIT_FAILURE);
      }
      nextToken = fgetnextstr(str, infile);
    }

    closefiles(infile, outfile, argv[1], outName);

  }

This program leaves comments and strings in the Clang source as they were. A simple version of the program - as the original problem challenged you to find - could have made use of a simpler version of the fgetnextstr routine as follows (note that this has the same "interface" as the more complicated version).

  tokens fgetnextstr (char *str, FILE *stream) {
  // Reads next string from stream into str.
  // Strings are of two kinds; the return value of the function
  //         distinguishes them from one another:
  // Returns wordToken if the string is a valid identifier or keyword (consists
  //         of an initial letter followed by other letters and digits only)
  // Returns otherToken if the string does not start with a letter,
  //         and contains no letters
  // Returns noToken if the stream is exhausted
  //         (no further string could be extracted)
    char ch = fgetc(stream);              // first character of token
    if (ch == EOF)                        // check that there really is one to find
      return(noToken);
  // The code below shows an array approach to manipulating the string
    int i = 0;                            // array index
    if (isalpha(ch)) {                    // then we must have a keyword or identifier
      do {                                // repeat
        str[i++] = ch;                    //   copy this character of the word to the buffer
        ch = fgetc(stream);               //   get the next character
      }
      while (ch != EOF && isalnum(ch));   // until we get past the end of the word
      str[i] = '\0';                      // add trailing nul character as usual
      ungetc(ch, stream);                 // the last character read is the first one of
                                          // the next token, so poke it back
      return wordToken;                   // return indication of word
    }
    else {                                // we must have a boring token
      do {                                // repeat
        str[i++] = ch;                    //   copy this character of the non-word to buffer
        ch = fgetc(stream);               //   get the next character
      } while (ch != EOF && !isalpha(ch));// until we get to the start of the next token
      str[i] = '\0';                      // add trailing nul character as usual
      ungetc(ch, stream);                 // the last character read is the first one of
                                          // the next token, so poke it back
      return otherToken;                  // return indication of non-word
    }
  }