Recursive Descent Parser

Most of the times when developping, people need a parser for a simple language and do not need to get the "big guns" to make a robust parser with Lex and Yacc.

Here is one example of a Recursive-Descent Parser, based on the simplest you can find the "The Dragon Book" (i.e. _Compilers Principles Techniques and Tools_, by Aho, Sethi, Ullman). It can be inserted in any project and only uses the C++ standard library strings as a dependency.

Example of input for this parser:

#MWC 1.0 7 22 26000 4 cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0 cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0 cmagentSOUNDPERFORMER soc_d00greetso0r_x1x 1 1 0x0 0x0 0 cmagentFACELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0 26001 1 cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 1 0 0x0 0x0 0 26002 1 cmagentSOUNDPERFORMER soc_d00greetso0r_x1x 1 0 0x0 0x0 0 26003 1 cmagentFACELIGHT so12_d00greetso0r_l1f 1 0 0x0 0x0 0 26004 2 cmagentSOUNDPERFORMER soc_d00greetso0r_x1x 1 1 0x0 0x0 0 cmagentFACELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0 26005 12 cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0 cmagentMOUTHPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0 cmagentHEADPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0 cmagentLEGPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0 cmagentTAILPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0 cmagentEARPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0 cmagentSOUNDPERFORMER soc_d00greetso0r_x1x 1 1 0x0 0x0 0 cmagentMODELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0 cmagentFACELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0 cmagentEARLIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0 cmagentBACKLIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0 cmagentLIVELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0 26006 1 cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 2 3 0x04 0x05 06

Grammar

Here is the grammar for the language to parse:

mwccfgfile : version totals mcids version: HASH IDENT FLOAT // must be #MWC 1.0 totals: INT INT // total number of MCID, total number of CMAC mcids : mcid | mcid mcids mcid : INT INT cmacs // MCID ident, number of CMACs in action cmacs : cmac | cmac cmacs cmac : IDENT IDENT INT INT HEX HEX INT // CMAC type, CMAC ident, repeat count, synchronous flag, reserved, reserved, reserved

Parser

Here is the code that implements the parser:

// Predictive, Recursive Descent Parser // Minimal error recovery #include using namespace std; #include #include enum Token { TOKEN_UNKNOWN, TOKEN_INT, TOKEN_HEX, TOKEN_IDENT, TOKEN_HASH, TOKEN_FLOAT, TOKEN_EOF }; FILE *mwcCfgFile = NULL; Token lookahead = TOKEN_UNKNOWN; string value = ""; int currentLine = 1; string errorReason = ""; int openFile(char *fileName) { mwcCfgFile = fopen(fileName, "r"); if (!mwcCfgFile) return 1; currentLine = 1; return 0; } char getCharacter() { char c = getc(mwcCfgFile); if (c=='\n') currentLine++; return c; } void ungetCharacter(char c) { if (c=='\n') currentLine--; ungetc(c, mwcCfgFile); } void closeFile() { fclose(mwcCfgFile); } Token getToken() { char c = 0; bool hexInteger = false; // Find out what this can be: // digit: INT or FLOAT // letter, followed by one or many (letter, digits, _, #): IDENT // #: HASH value = ""; // Skip whitespaces do { c = getCharacter(); } while (isspace(c) && !feof(mwcCfgFile)); // Process end of file if (feof(mwcCfgFile)) return TOKEN_EOF; // Try to parse an INT or a FLOAT if (isdigit(c)) { // gather all digits for an INT or a HEX do { // Add character to value value += c; c = getCharacter(); // Detect the switch to a hex digit if (value.length()==1 && tolower(c)=='x') hexInteger = true; } while (isdigit(c) || (hexInteger && isxdigit(c)) || (hexInteger && value.length()==1 && tolower(c)=='x')); // Stop right here if this is an hex number if (hexInteger) { if (!feof(mwcCfgFile)) ungetCharacter(c); return TOKEN_HEX; } // If followed by a period, then it's considered a FLOAT, otherwise an INT if (c != '.') { if (!feof(mwcCfgFile)) ungetCharacter(c); return TOKEN_INT; } // read in the decimals of the FLOAT do { // Add character to value value += c; // Get next character c = getCharacter(); } while (isdigit(c)); if (!feof(mwcCfgFile)) ungetCharacter(c); return TOKEN_FLOAT; // Try to parse an IDENT } else if (isalpha(c)) { do { // Add character to value value += c; // Get next character c = getCharacter(); } while (isalpha(c) || isdigit(c) || c=='_' || c=='#'); if (!feof(mwcCfgFile)) ungetCharacter(c); return TOKEN_IDENT; // Try to parse a HASH } else if (c == '#') { // Add character to value value += c; return TOKEN_HASH; } // Unknown character return TOKEN_UNKNOWN; } char *tokenName(Token t) { switch(t) { case TOKEN_INT: return "integer"; case TOKEN_HEX: return "hex number"; case TOKEN_IDENT: return "identifier"; case TOKEN_HASH: return "'#'"; case TOKEN_FLOAT: return "floating point number"; case TOKEN_EOF: return "end-of-file"; } return "UNKNOWN"; } void match(Token t) { if (lookahead == t) lookahead = getToken(); else { errorReason = string("Expected ") + string(tokenName(t)) + string(", received '") + value + string("'"); throw currentLine; } } /* Grammar for MWC.CFG files: mwccfgfile : version totals mcids version: HASH IDENT FLOAT // must be #MWC 1.0 totals: INT INT // total number of MCID, total number of CMAC mcids : mcid | mcid mcids mcid : INT INT cmacs // MCID ident, number of CMACs in action cmacs : cmac | cmac cmacs cmac : IDENT IDENT INT INT HEX HEX INT // CMAC type, CMAC ident, repeat count, synchronous flag, reserved, reserved, reserved */ int CMAC() { match(TOKEN_IDENT); match(TOKEN_IDENT); match(TOKEN_INT); match(TOKEN_INT); match(TOKEN_HEX); match(TOKEN_HEX); match(TOKEN_INT); return 0; } int CMACs() { CMAC(); if (lookahead == TOKEN_IDENT) CMACs(); return 0; } int MCID() { match(TOKEN_INT); match(TOKEN_INT); CMACs(); return 0; } int MCIDs() { MCID(); if (lookahead == TOKEN_INT) MCIDs(); return 0; } int totals() { match(TOKEN_INT); match(TOKEN_INT); return 0; } int version() { match(TOKEN_HASH); // lookahead must be IDENT == "MWC" if (lookahead!=TOKEN_IDENT || value!="MWC") { errorReason = string("Expected 'MWC', received '") + value + string("'"); throw currentLine; } match(TOKEN_IDENT); // lookahead must be FLOAT == "1.0" if (lookahead!=TOKEN_FLOAT || value!="1.0") { errorReason = string("Expected '1.0', received '") + value + string("'"); throw currentLine; } match(TOKEN_FLOAT); return 0; } int MWCCFGFile() { version(); totals(); MCIDs(); return 0; } int parseMWCCFGFile(char *fileName, string &errorMessage) { openFile(fileName); // Parse the file try { lookahead = getToken(); MWCCFGFile(); } catch(int lineWithError) { char temp[11]; itoa(lineWithError, temp, 10); errorMessage = string("Parse error around line ") + string(temp) + string(": ") + errorReason + string("."); return lineWithError; } closeFile(); return 0; }