// // EG. Generate random strings from a context-free grammar. // // Copyright (C) 2005 James Moen. // // This program is free software; you can redistribute it and/or modify it // under the terms of the GNU General Public License as published by the Free // Software Foundation; either version 2 of the License, or (at your option) // any later version. // // This program is distributed in the hope that it will be useful, but WITHOUT // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for // more details. // // You should have received a copy of the GNU General Public License along // with this program; if not, write to the Free Software Foundation, Inc., 59 // Temple Place, Suite 330, Boston, MA 02111-1307 USA. // // // EG stands for Exempli Gratia or maybe Electric Grammar. It reads a context- // free grammar from a file, randomly generates example strings from it, then // writes the strings to standard output. EG can help design grammars, it can // generate test data for parsers, and it can produce "funny" nonsense prose. // // The EG command line looks like this. All parameters are optional. See MAIN // below for defaults. // // eg -d D -l L -n N -r R -s S F // // D is the number of levels the grammar will be descended while strings are // generated. L is the maximum length of the strings to be generated. N is the // number of strings to be generated. R is the maximum number of times that a // nonterminal will be repeated. S is the integer seed for the random number // generator. F is the name of the file that contains the grammar. // // Briefly, EG's syntax for grammars looks like this. CHAR, DIGIT, and LETTER // have the obvious meanings. Comments, which begin with #'s and extend to the // ends of lines, may appear wherever white space may appear. // // bar = "|". close = ")". equal = "=". plus = "+". // blank = " ". dot = ".". open = "(". quote = "\"". // comma = ",". empty = "". option = "?". star = "*". // // name = letter | name letter | name digit. // number = digit | digit number. // string = quote chars quote. // chars = empty | char chars. // // grammar = production | production grammar. // production = name equal disjunction dot. // disjunction = conjunction | conjunction bar disjunction. // conjunction = term | term blank conjunction. // term = name postfix | string postfix. // postfix = empty | option | plus | star | // open number close | open number comma number close. // // As in BNF, NAMEs are nonterminal symbols, and STRINGs are terminal symbols. // Adjacency means concatenation, and "|" means alternation. If X is a name or // or a string, then X(L) means X is repeated exactly L times. X(L, H) means X // is repeated at least L times and at most H times. X? means X(0, 1). If R is // defined by the "-r" option, then X* means X(0, R) and X+ means X(1, R). // // If you have questions or comments about EG, write to moen@augsburg.edu. // #include #include #include #include #include #include #include #include #define arrowChar '^' // It looks like an uparrow. #define eofChar EOF // End Of File char. #define eogChar 0x01 // End Of Grammar char. #define eolChar '\n' // End Of Line char. #define eosChar 0x00 // End Of String char. #define false 0x00 // A fake Boolean constant. #define isNameChar isalnum // Because it's ugly. #define isDigit isdigit // Because it's ugly. #define label jmp_buf // A JMP_BUF is really a label. #define lineLength 80 // Length of lines from SOURCE. #define lineNumberLength 5 // Digits in a line number. #define maxChar '~' // Max char allowed in SOURCE. #define maxInt INT_MAX // Max INT. #define maxRandom RAND_MAX // Max value returned by RAND. #define minChar ' ' // Min char allowed in SOURCE. #define nil NULL // Because it's ugly. #define outOfRange ERANGE // Because it's ugly. #define true 0x01 // A fake Boolean constant. #define vaArg va_arg // Because it's ugly. #define vaEnd va_end // Because it's ugly. #define vaList va_list // Because it's ugly. #define vaStart va_start // Because it's ugly. typedef char bool; // A Boolean. typedef struct headerStruct headerNode; // A HEADER node. typedef struct linkStruct linkNode; // A LINK node. typedef struct nameStruct nameNode; // A NAME node. typedef struct stringStruct stringNode; // A STRING node. typedef char *refChar; // Ref to a CHAR. typedef void (*refCode)(); // Ref to a VOID function. typedef FILE *refFile; // Ref to a FILE. typedef linkNode *refGeneric; // Ref to a LINK. typedef headerNode *refHeader; // Ref to a HEADER. typedef int *refInt; // Ref to an INT. typedef linkNode *refLink; // Ref to a LINK. typedef nameNode *refName; // Ref to a NAME. typedef char **refRefChar; // Ref to a ref to a CHAR. typedef stringNode *refString; // Ref to a STRING. #define toDouble(form) ((double) form) // Cast to DOUBLE. #define toInt(form) ((int) form) // Cast to INT. #define toRefGeneric(form) ((refLink) form) // Cast to REF LINK. #define toRefHeader(form) ((refHeader) form) // Cast to REF HEADER. #define toRefName(form) ((refName) form) // Cast to REF NAME. #define toRefString(form) ((refString) form) // Cast to REF FORM. // ERROR. Syntax errors. MAX ERR must be no greater than the number of bits in // an INT. Each error must have a matching string in MESSAGE. enum { minErr, badCharErr, badEscapeErr, badNumberErr, badTokenErr, charExpectedErr, closeParenExpectedErr, dotExpectedErr, endExpectedErr, equalExpectedErr, lineTooLongErr, nameExpectedErr, numberExpectedErr, quoteExpectedErr, termExpectedErr, undefinedNameErr, maxErr }; refChar message[maxErr + 1] = { " ", // minErr "Illegal character.", // badCharErr "This character cannot follow `\\'.", // badEscapeErr "Number is out of range.", // badNumberErr "Illegal symbol.", // badTokenErr "Character expected.", // charExpectedErr "`)' expected.", // closeParenExpectedErr "`.' expected.", // dotExpectedErr "End of file expected.", // endExpectedErr "`=' expected.", // equalExpectedErr "Source line is too long.", // lineTooLongErr "Name expected.", // nameExpectedErr "Nonnegative integer expected.", // numberExpectedErr "`\"' expected.", // QuoteExpectedErr "Name or string expected.", // termExpectedErr "Name `%s' was never defined.", // undefinedNameErr " " }; // maxErr // TOKEN. Source tokens. enum { minToken, barToken, blankToken, closeParenToken, commaToken, dotToken, endToken, equalToken, nameToken, numberToken, openParenToken, plusToken, questionToken, starToken, stringToken, maxToken }; // TYPE. Identify structure types. These must fit in a CHAR. enum { minType, andType, linkType, nameType, orType, rangeType, stringType, maxType }; // HEADER. Represent a conjunction, disjunction, or range (depending on TYPE). // If it's a conjunction, then FORM is a LINKed list of names or strings. If // it's a disjunction, then FORM is a LINKed list of UPPER conjunctions. If // it's a range, then FORM is a conjunction, disjunction, NAME, or STRING that // will be repeated at least LOWER times and at most UPPER times. struct headerStruct { char type; long lower; long upper; refGeneric form; }; // LINK. Represent a node in a linked list. FIRST is the form at the beginning // of the list, and REST is the rest of the list. By default, GENERIC pointers // reference LINKs. struct linkStruct { char type; refGeneric first; refGeneric rest; }; // NAME. Represent a nonterminal called STRING whose definition is FORM. NAMEs // are kept in an unbalanced binary search tree, indexed by their STRINGs. The // subtrees of NAMEs with STRINGs less than STRING are in LEFT, and those with // STRINGs greater than STRING are in RIGHT. struct nameStruct { char type; refChar string; refGeneric form; refName left; refName right; }; // STRING. Represent a terminal called STRING. struct stringStruct { char type; refChar string; }; int allErrs; // Bitset of all ERRORs ever encountered. refChar bufferEnd; // Last char in the string buffer. refChar bufferStart; // Next available char in the string buffer. char ch; // Last char copied from LINE. int depth; // Current recursion depth. refName dummyName; // A name that the user can't type. refGeneric emptyString; // The empty string "". int errs[lineLength]; // Bitsets of errors corresponding to LINE. refInt errsEnd; // Where to enqueue sets in ERRS. refInt errsStart; // Where to dequeue sets from ERRS. char line[lineLength]; // Last line read from SOURCE. int lineCount; // Count lines read from SOURCE. refChar lineEnd; // Where to enqueue chars in LINE. refChar lineStart; // Where to dequeue chars from LINE. int maxDepth; // Max recursion depth during generation. int maxLength; // Max length of generated strings. int maxRepeats; // Max times to repeat a nonterminal. refName nameTree; // Unbalanced binary search tree of NAMEs. int seed; // Random number seed. refChar self; // Name of this program, probably "eg". refFile source; // Current source file. refName startName; // A NAME whose STRING is "start". label stopGenerating; // LONGJMP here to halt generation. int token; // Last token read from LINE. long tokenNumber; // LONG (if any) corresponding to TOKEN. refGeneric tokenForm; // Form (if any) corresponding to TOKEN. // FAIL. Write an error message to STDERR and halt. void fail(refChar message, ...) { vaList arguments; vaStart(arguments, message); vfprintf(stderr, message, arguments); putc(eolChar, stderr); vaEnd(arguments); exit(1); } // IS ALL DIGITS. Test if STRING is a string of digits. bool isAllDigits(refChar string) { while (*string != eosChar) { if (isDigit(*string)) { string += 1; } else { return false; }} return true; } // STRING TO INT. If STRING is made up of digits then convert it to an INT. If // not, then scold the user about an invalid PARAMETER. int stringToInt(refChar string, refChar parameter) { if (isAllDigits(string)) { return atoi(string); } else { fail("%s: invalid %s %s", self, parameter, string); }} // MAKE HEADER. Make a new HEADER and return it as a GENERIC. refGeneric makeHeader(char type, int lower, int upper, refGeneric form) { refHeader newHeader = malloc(sizeof(headerNode)); newHeader->type = type; newHeader->lower = lower; newHeader->upper = upper; newHeader->form = form; return toRefGeneric(newHeader); } // MAKE LINK. Make a new LINK and return it as a GENERIC. refGeneric makeLink(refGeneric first, refGeneric rest) { refLink newLink = malloc(sizeof(linkNode)); newLink->type = linkType; newLink->first = first; newLink->rest = rest; return toRefGeneric(newLink); } // MAKE NAME. Return a new NAME. refName makeName(refChar string) { refName newName = malloc(sizeof(nameNode)); newName->type = nameType; newName->string = strcpy(malloc(strlen(string) + 1), string); newName->form = nil; newName->left = nil; newName->right = nil; return newName; } // INTERN NAME. Test if NAME TREE has a name with the given STRING slot. If it // does, return that name. If it does not, then make a new name, add it to the // tree, and return the new name. Names are returned as GENERICs. refGeneric internName(refChar string) { int test; refName subtree = nameTree; while (true) { test = strcmp(string, subtree->string); if (test < 0) { if (subtree->left == nil) { return toRefGeneric(subtree->left = makeName(string)); } else { subtree = subtree->left; }} else if (test > 0) { if (subtree->right == nil) { return toRefGeneric(subtree->right = makeName(string)); } else { subtree = subtree->right; }} else { return toRefGeneric(subtree); }}} // MAKE STRING. Make a new STRING and return it as a GENERIC. refGeneric makeString(refChar string) { refString newString = malloc(sizeof(stringNode)); newString->type = stringType; newString->string = strcpy(malloc(strlen(string) + 1), string); return toRefGeneric(newString); } // SYNTAX ERROR. Assert that the current character on LINE has the error ERR. void syntaxError(int err) { int mask = 1 << err; allErrs |= mask; *errsStart |= mask; } // WRITE ERROR LINE. Write LINE and its line number to STDERR. Then underneath // that, for each position in LINE where there's an error, write an ARROW CHAR // followed by a series of error numbers. We may need many such lines to show // all the errors. If there are no errors in LINE, write nothing. // // We first saw this done in Urs Amman's Pascal compiler for CDC 6000 machines // back in the 1970's, but it didn't use multiple error lines. void writeErrorLine() { int blanks; char delimiter; int error; int errors; bool going = false; // Find the first error. errsStart = errs; while (! going && errsStart < errsEnd) { going = (*errsStart != 0); errsStart += 1; } // If there was a first error, then write stuff. if (going) { fprintf(stderr, "%0*i %s", lineNumberLength, lineCount, line); while (going) { blanks = lineNumberLength; errsStart = errs; going = false; fputc(eolChar, stderr); while (errsStart < errsEnd) { if (*errsStart == 0) { blanks += 1; errsStart += 1; } else { going = true; while (blanks > 0) { fputc(' ', stderr); blanks -= 1; } errors = *errsStart; *errsStart = 0; delimiter = arrowChar; for (error = minErr + 1; error < maxErr; error += 1) { if ((errors & (1 << error)) != 0) { putc(delimiter, stderr); delimiter = ','; if (error > 9) { fprintf(stderr, "%2i", error); errsStart += 3; } else { fprintf(stderr, "%1i", error); errsStart += 2; }}}}}}}} // WRITE ERROR MESSAGE. Write a message about ERROR to STDERR. The message may // include STRING. void writeErrorMessage(int error, refChar string) { fprintf(stderr, "%2i: ", error); fprintf(stderr, message[error], string); fputc('\n', stderr); } // WRITE UNDEFINED MESSAGES. Write error messages about undefined names in the // tree of names SUBTREE. void writeUndefinedMessages(refName subtree) { while (subtree != nil) { writeUndefinedMessages(subtree->left); if (subtree->form == nil) { writeErrorMessage(undefinedNameErr, subtree->string); } subtree = subtree->right; }} // WRITE ERROR MESSAGES. Write every error message in the bitset ALL ERRS. void writeErrorMessages() { int error; fputc(eolChar, stderr); for (error = minErr + 1; error < maxErr; error += 1) { if ((allErrs & (1 << error)) != 0) { if (error == undefinedNameErr) { writeUndefinedMessages(nameTree); } else { writeErrorMessage(error, ""); }}}} // NEXT LINE. Read the next line from SOURCE into LINE, and initialize ERRS as // we go. void nextLine() { lineCount += 1; errsStart = errs; errsEnd = errs; lineStart = line; lineEnd = line; while (true) { ch = getc(source); if (ch == eofChar) { errsEnd[0] = 0; errsEnd[1] = 0; errsEnd += 2; lineEnd[0] = eogChar; lineEnd[1] = eosChar; lineEnd += 2; break; } else if (ch == eolChar) { *errsEnd = 0; errsEnd += 1; *lineEnd = eosChar; lineEnd += 1; break; } else { if (lineEnd < line + lineLength - 1) { if (ch < minChar || ch > maxChar) { ch = ' '; syntaxError(badCharErr); } *errsEnd = 0; errsEnd += 1; *lineEnd = ch; lineEnd += 1; } else { syntaxError(lineTooLongErr); }}}} // NEXT CHAR. If we're at the end of LINE then read the next line from SOURCE. // Then copy the next char from LINE into CH. void nextChar() { if (lineStart >= lineEnd) { writeErrorLine(); nextLine(); } ch = *lineStart; errsStart += 1; lineStart += 1; } // NEXT BAR. Parse the token for "|". void nextBar() { token = barToken; nextChar(); } // NEXT CLOSE PAREN. Parse the token for ")". void nextCloseParen() { token = closeParenToken; nextChar(); } // NEXT COMMA. Parse the token for ",". void nextComma() { token = commaToken; nextChar(); } // NEXT COMMENT. Parse a comment. void nextComment() { nextLine(); nextChar(); } // NEXT DOT. Parse the token for ".". void nextDot() { token = dotToken; nextChar(); } // NEXT END. Parse the token for the end of a source file. void nextEnd() { token = endToken; } // NEXT EQUAL. Parse the token for "=". void nextEqual() { token = equalToken; nextChar(); } // NEXT ILLEGAL. Parse the token for an illegal character. void nextIllegal() { syntaxError(badTokenErr); nextChar(); } // NEXT NAME. Parse the token for a name. void nextName() { char string[lineLength]; refChar stringStart = string; while (isNameChar(ch)) { *stringStart = ch; stringStart += 1; nextChar(); } *stringStart = eosChar; token = nameToken; tokenForm = internName(string); } // NEXT NUMBER. Parse the token for a positive integer. void nextNumber() { char string[lineLength]; refChar stringStart = string; while (isDigit(ch)) { *stringStart = ch; stringStart += 1; nextChar(); } *stringStart = eosChar; token = numberToken; tokenNumber = strtol(string, nil, 10); if (errno == outOfRange) { syntaxError(badNumberErr); }} // NEXT OPEN PAREN. Parse the token for "(". void nextOpenParen() { token = openParenToken; nextChar(); } // NEXT PLUS. Parse the token for "+". void nextPlus() { token = plusToken; nextChar(); } // NEXT QUESTION. Parse the token for "?". void nextQuestion() { token = questionToken; nextChar(); } // NEXT STAR. Parse the token for "*". void nextStar() { token = starToken; nextChar(); } // NEXT STRING. Parse the token for a string. We can handle the usual C letter // escapes, but not the numeric ones. Maybe later. void nextString() { char string[lineLength]; refChar stringEnd = string; nextChar(); while (ch != eosChar && ch != '"') { if (ch == '\\') { nextChar(); if (ch == eosChar) { syntaxError(charExpectedErr); } else { switch (ch) { case '"': { break; } case '\\': { break; } case 'a': { ch = '\a'; break; } case 'b': { ch = '\b'; break; } case 'f': { ch = '\f'; break; } case 'n': { ch = '\n'; break; } case 'r': { ch = '\r'; break; } case 't': { ch = '\t'; break; } case 'v': { ch = '\v'; break; } default: { syntaxError(badEscapeErr); ch = ' '; break; }} *stringEnd = ch; stringEnd += 1; nextChar(); }} else { *stringEnd = ch; stringEnd += 1; nextChar(); }} *stringEnd = eosChar; if (ch == '"') { nextChar(); } else { syntaxError(quoteExpectedErr); } tokenForm = makeString(string); token = stringToken; } // NEXT TOKEN. Set TOKEN to the next token from SOURCE. Select the appropriate // parsing function from TOKEN PARSER, then call that function. Continue until // we get a token other than BLANK TOKEN. refCode tokenParser[127] = { nextChar, // ^@ 0x00 A newline. Skip it. nextEnd, // ^A 0x01 End of file. Don't skip it. nextIllegal, // ^B 0x02 Chars 0x02 through 0x1F should never appear. nextIllegal, // ^C 0x03 nextIllegal, // ^D 0x04 nextIllegal, // ^E 0x05 nextIllegal, // ^F 0x06 nextIllegal, // ^G 0x07 nextIllegal, // ^H 0x08 nextIllegal, // ^I 0x09 nextIllegal, // ^J 0x0A nextIllegal, // ^K 0x0B nextIllegal, // ^L 0x0C nextIllegal, // ^M 0x0D nextIllegal, // ^N 0x0E nextIllegal, // ^O 0x0F nextIllegal, // ^P 0x10 nextIllegal, // ^Q 0x11 nextIllegal, // ^R 0x12 nextIllegal, // ^S 0x13 nextIllegal, // ^T 0x14 nextIllegal, // ^U 0x15 nextIllegal, // ^V 0x16 nextIllegal, // ^W 0x17 nextIllegal, // ^X 0x18 nextIllegal, // ^Y 0x19 nextIllegal, // ^Z 0x1A nextIllegal, // ^[ 0x1B nextIllegal, // ^\ 0x1C nextIllegal, // ^] 0x1D nextIllegal, // ^^ 0x1E nextIllegal, // ^_ 0x1F nextChar, // 0x20 A blank. Skip it. nextIllegal, // ! 0x21 nextString, // " 0x22 A string. Strings are terminal symbols. nextComment, // # 0x23 A comment. Skip the rest of the line. nextIllegal, // $ 0x24 nextIllegal, // % 0x25 nextIllegal, // & 0x26 nextIllegal, // ' 0x27 nextOpenParen, // ( 0x28 An open parenthesis starts a repeat count. nextCloseParen, // ) 0x29 A close parenthesis ends a repeat count. nextStar, // * 0x2A A Kleene star means repeat 0 or more times. nextPlus, // + 0x2B A Kleene plus means repeat 1 or more times. nextComma, // , 0x2C A comma delimits two repeat counts. nextIllegal, // - 0x2D nextDot, // . 0x2E A dot terminates a production. nextIllegal, // / 0x2F nextNumber, // 0 0x30 Digits begin nonnegative integers. nextNumber, // 1 0x31 nextNumber, // 2 0x32 nextNumber, // 3 0x33 nextNumber, // 4 0x34 nextNumber, // 5 0x35 nextNumber, // 6 0x36 nextNumber, // 7 0x37 nextNumber, // 8 0x38 nextNumber, // 9 0x39 nextIllegal, // : 0x3A nextIllegal, // ; 0x3B nextIllegal, // < 0x3C nextEqual, // = 0x3D An equal sign means a definition. nextIllegal, // > 0x3E nextQuestion, // ? 0x3F A question means an optional nonterminal. nextIllegal, // @ 0x40 nextName, // A 0x41 Letters begin nonterminal symbols. nextName, // B 0x42 nextName, // C 0x43 nextName, // D 0x44 nextName, // E 0x45 nextName, // F 0x46 nextName, // G 0x47 nextName, // H 0x48 nextName, // I 0x49 nextName, // J 0x4A nextName, // K 0x4B nextName, // L 0x4C nextName, // M 0x4D nextName, // N 0x4E nextName, // O 0x4F nextName, // P 0x50 nextName, // Q 0x51 nextName, // R 0x52 nextName, // S 0x53 nextName, // T 0x54 nextName, // U 0x55 nextName, // V 0x56 nextName, // W 0x57 nextName, // X 0x58 nextName, // Y 0x59 nextName, // Z 0x5A nextIllegal, // [ 0x5B nextIllegal, // \ 0x5C nextIllegal, // ] 0x5D nextIllegal, // ^ 0x5E nextIllegal, // _ 0x5F nextIllegal, // ` 0x60 nextName, // a 0x61 Letters begin nonterminal symbols. nextName, // b 0x62 nextName, // c 0x63 nextName, // d 0x64 nextName, // e 0x65 nextName, // f 0x66 nextName, // g 0x67 nextName, // h 0x68 nextName, // i 0x69 nextName, // j 0x6A nextName, // k 0x6B nextName, // l 0x6C nextName, // m 0x6D nextName, // n 0x6E nextName, // o 0x6F nextName, // p 0x70 nextName, // q 0x71 nextName, // r 0x72 nextName, // s 0x73 nextName, // t 0x74 nextName, // u 0x75 nextName, // v 0x76 nextName, // w 0x77 nextName, // x 0x78 nextName, // y 0x79 nextName, // z 0x7A nextIllegal, // { 0x7B nextBar, // | 0x7C A vertical bar means alternation. nextIllegal, // } 0x7D nextIllegal }; // ~ 0x7E void nextToken() { token = blankToken; tokenForm = nil; while (token == blankToken) { tokenParser[ch](); }} // EXPECT. Skip TOKEN if it's EXPECTED TOKEN. Issue an ERROR if it's not. void expect(int expectedToken, int error) { if (token == expectedToken) { nextToken(); } else { syntaxError(error); }} // NEXT TERM. Parse a term, and return it. A term is a name or string followed // by an optional "+", "?", or "*". It can also be followed by "(N)" or by // "(M, N)", where N and M are positive integer constants. refGeneric nextTerm() { refGeneric term; if (token == nameToken || token == stringToken) { term = tokenForm; } else { syntaxError(termExpectedErr); term = emptyString; } nextToken(); switch (token) { case openParenToken: { long lower = 0; long upper = 0; nextToken(); if (token == numberToken) { lower = tokenNumber; nextToken(); } else { syntaxError(numberExpectedErr); } if (token == commaToken) { nextToken(); if (token == numberToken) { upper = tokenNumber; nextToken(); } else { syntaxError(numberExpectedErr); }} else { upper = lower; } expect(closeParenToken, closeParenExpectedErr); return makeHeader(rangeType, lower, upper, term); } case plusToken: { nextToken(); return makeHeader(rangeType, 1, maxRepeats, term); } case questionToken: { nextToken(); return makeHeader(rangeType, 0, 1, term); } case starToken: { nextToken(); return makeHeader(rangeType, 0, maxRepeats, term); } default: { return term; }}} // NEXT CONJUNCTION. Parse a conjunction: a series of terms separated by white // space. We LINK the terms into an ordered list, put the list into a HEADER, // and return the HEADER. refGeneric nextConjunction() { refGeneric form = nextTerm(); if (token == nameToken || token == stringToken) { refLink first = makeLink(form, nil); refLink last = first; while (token == nameToken || token == stringToken) { last = (last->rest = makeLink(nextTerm(), nil)); } return makeHeader(andType, 0, 0, first); } else { return form; }} // NEXT DISJUNCTION. Parse a disjunction: a series of conjunctions, separated // by "|"s. We LINK the conjunctions into an unordered list, put the list into // a HEADER, and return the HEADER. refGeneric nextDisjunction() { refGeneric form = nextConjunction(); if (token == barToken) { int count = 1; refLink first = makeLink(form, nil); while (token == barToken) { nextToken(); first = makeLink(nextConjunction(), first); count += 1; } return makeHeader(orType, 1, count, first); } else { return form; }} // NEXT PRODUCTION. Parse a production: a name, followed by "=", followed by a // disjunction, followed by ".". If the name is undefined then define it to be // the disjunction. If it was already defined, then disjoin the disjunction to // the name's previous definition. void nextProduction() { refGeneric leftForm; refGeneric leftForms; int leftUpper; refName leftName; refGeneric rightForm; refGeneric rightForms; // Parse. if (token == nameToken) { leftName = toRefName(tokenForm); } else { syntaxError(nameExpectedErr); leftName = dummyName; } nextToken(); expect(equalToken, equalExpectedErr); rightForm = nextDisjunction(); expect(dotToken, dotExpectedErr); // Establish or update the definition. leftForm = leftName->form; if (leftForm == nil) { leftName->form = rightForm; } else if (leftForm->type == orType) { leftForms = toRefHeader(leftForm)->form; if (rightForm->type == orType) { while (leftForms->rest != nil) { leftForms = leftForms->rest; } leftForms->rest = toRefHeader(rightForm)->form; toRefHeader(leftForm)->upper += toRefHeader(rightForm)->upper; } else { toRefHeader(leftForm)->upper += 1; toRefHeader(leftForm)->form = makeLink(rightForm, leftForms); }} else if (rightForm->type == orType) { leftUpper = toRefHeader(rightForm)->upper + 1; rightForms = toRefHeader(rightForm)->form; leftForms = makeLink(leftForm, rightForms); leftName->form = makeHeader(orType, 0, leftUpper, leftForms); } else { leftForms = makeLink(leftForm, makeLink(rightForm, nil)); leftName->form = makeHeader(orType, 1, 2, leftForms); }} // HAS UNDEFINED NAME. Test if the tree of names SUBTREE contains at least one // undefined name. bool hasUndefinedName(refName subtree) { while (subtree != nil) { if (subtree->form == nil) { return true; } else if (hasUndefinedName(subtree->left)) { return true; } else { subtree = subtree->right; }} return false; } // NEXT GRAMMAR. Parse a series of productions. void nextGrammar() { ch = ' '; allErrs = 0; lineCount = 0; nextToken(); while (token != endToken) { nextProduction(); } if (hasUndefinedName(nameTree)) { syntaxError(undefinedNameErr); } expect(endToken, endExpectedErr); } // HOW MANY. Return a random INT between the LOWER and UPPER limits of HEADER, // inclusive. int howMany(refHeader header) { int lower = header->lower; int upper = header->upper - lower + 1; return lower + toInt(upper * toDouble(rand()) / (maxRandom + 1.0)); } // GENERATING. Do all the work for GENERATE (see below). Recursively generate // a random string from the grammar FORM. If we recurse more deeply than MAX // DEPTH lavels, or if we make a string longer than MAX LENGTH chars, then we // LONG JMP back to GENERATE. Otherwise we leave the generated string in the // buffer, starting at address BUFFER START. void generating(refGeneric form) { if (form == nil) { fail("%s: internal error 0", self); } else if (depth > maxDepth) { longjmp(stopGenerating, true); } else { switch (form->type) { case andType: { refGeneric forms = toRefHeader(form)->form; depth += 1; while (forms != nil) { generating(forms->first); forms = forms->rest; } depth -= 1; return; } case nameType: { depth += 1; generating(toRefName(form)->form); depth -= 1; return; } case orType: { int count = howMany(toRefHeader(form)); refGeneric forms = toRefHeader(form)->form; while (count > 1) { forms = forms->rest; count -= 1; } depth += 1; generating(forms->first); depth -= 1; return; } case rangeType: { int count = howMany(toRefHeader(form)); depth += 1; while (count > 0) { generating(toRefHeader(form)->form); count -= 1; } depth -= 1; return; } case stringType: { refChar string = toRefString(form)->string; int length = strlen(string); if (length < bufferEnd - bufferStart) { strcpy(bufferStart, string); bufferStart += length; return; } else { longjmp(stopGenerating, true); }} default: { fail("%s: internal error %i", self, form->type); }}}} // GENERATE. Allocate a string buffer, and repeatedly call GENERATING until we // get a string in BUFFER. Then print the string. void generate() { char buffer[maxLength]; while (true) { if (! setjmp(stopGenerating)) { depth = 0; bufferStart = buffer; bufferEnd = buffer + maxLength; generating(toRefGeneric(startName)); fputs(buffer, stdout); return; }}} // MAIN. The main program. int main(int argc, refRefChar argv) { int stringCount; // Initialize. dummyName = makeName(""); emptyString = makeString(""); startName = makeName("start"); nameTree = startName; // Establish numeric options from the command line. maxDepth = 1024; // Option -d default. maxLength = 80; // Option -l default. stringCount = 1; // Option -n default. maxRepeats = 5; // Option -r default. seed = time(nil) % maxInt; // Option -s default. self = *argv; argc -= 1; argv += 1; while (argc > 0 && **argv == '-') { if (strlen(*argv) == 2) { switch ((*argv)[1]) { case 'd': { argc -= 1; argv += 1; maxDepth = stringToInt(*argv, "depth"); break; } case 'l': { argc -= 1; argv += 1; maxLength = stringToInt(*argv, "string length") + 1; break; } case 'n': { argc -= 1; argv += 1; stringCount = stringToInt(*argv, "string count"); break; } case 'r': { argc -= 1; argv += 1; maxRepeats = stringToInt(*argv, "repetition count"); break; } case 's': { argc -= 1; argv += 1; seed = stringToInt(*argv, "random seed"); break; } default: { fail("%s: invalid option %s", self, *argv); }} argc -= 1; argv += 1; } else { fail("%s: invalid option %s", self, *argv); }} // Read a grammar from the file named on the command line. If there is no such // file, then read the grammar from STDIN instead. if (argc > 0) { source = fopen(*argv, "r"); if (source == nil) { fail("%s: cannot open %s", self, *argv); } else { nextGrammar(); fclose(source); }} else { source = stdin; nextGrammar(); } // If there were errors in the grammar then write error messages. Otherwise we // generate strings from the grammar. if (allErrs == 0) { srand(seed); while (stringCount > 0) { generate(); stringCount -= 1; } return 0; } else { writeErrorLine(); writeErrorMessages(); return 1; }}