Recursive Descent Parser
Most of the times when developping, people need a parser for a simple language and do not need to get the "big guns" to make a robust parser with Lex and Yacc.
Here is one example of a Recursive-Descent Parser, based on the simplest you can find the "The Dragon Book" (i.e. _Compilers Principles Techniques and Tools_, by Aho, Sethi, Ullman). It can be inserted in any project and only uses the C++ standard library strings as a dependency.
Example of input for this parser:
#MWC 1.0
7 22
26000 4
cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0
cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0
cmagentSOUNDPERFORMER soc_d00greetso0r_x1x 1 1 0x0 0x0 0
cmagentFACELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0
26001 1
cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 1 0 0x0 0x0 0
26002 1
cmagentSOUNDPERFORMER soc_d00greetso0r_x1x 1 0 0x0 0x0 0
26003 1
cmagentFACELIGHT so12_d00greetso0r_l1f 1 0 0x0 0x0 0
26004 2
cmagentSOUNDPERFORMER soc_d00greetso0r_x1x 1 1 0x0 0x0 0
cmagentFACELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0
26005 12
cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0
cmagentMOUTHPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0
cmagentHEADPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0
cmagentLEGPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0
cmagentTAILPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0
cmagentEARPERFORMER a_stand#stand_so0r_greet 1 1 0x0 0x0 0
cmagentSOUNDPERFORMER soc_d00greetso0r_x1x 1 1 0x0 0x0 0
cmagentMODELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0
cmagentFACELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0
cmagentEARLIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0
cmagentBACKLIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0
cmagentLIVELIGHT so12_d00greetso0r_l1f 1 1 0x0 0x0 0
26006 1
cmagentMOTIONPERFORMER a_stand#stand_so0r_greet 2 3 0x04 0x05 06
Grammar
Here is the grammar for the language to parse:
mwccfgfile : version totals mcids
version: HASH IDENT FLOAT // must be #MWC 1.0
totals: INT INT // total number of MCID, total number of CMAC
mcids : mcid | mcid mcids
mcid : INT INT cmacs // MCID ident, number of CMACs in action
cmacs : cmac | cmac cmacs
cmac : IDENT IDENT INT INT HEX HEX INT // CMAC type, CMAC ident, repeat count, synchronous flag, reserved, reserved, reserved
Parser
Here is the code that implements the parser:
// Predictive, Recursive Descent Parser
// Minimal error recovery
#include
using namespace std;
#include
#include
enum Token { TOKEN_UNKNOWN, TOKEN_INT, TOKEN_HEX, TOKEN_IDENT, TOKEN_HASH, TOKEN_FLOAT, TOKEN_EOF };
FILE *mwcCfgFile = NULL;
Token lookahead = TOKEN_UNKNOWN;
string value = "";
int currentLine = 1;
string errorReason = "";
int openFile(char *fileName)
{
mwcCfgFile = fopen(fileName, "r");
if (!mwcCfgFile)
return 1;
currentLine = 1;
return 0;
}
char getCharacter()
{
char c = getc(mwcCfgFile);
if (c=='\n')
currentLine++;
return c;
}
void ungetCharacter(char c)
{
if (c=='\n')
currentLine--;
ungetc(c, mwcCfgFile);
}
void closeFile()
{
fclose(mwcCfgFile);
}
Token getToken()
{
char c = 0;
bool hexInteger = false;
// Find out what this can be:
// digit: INT or FLOAT
// letter, followed by one or many (letter, digits, _, #): IDENT
// #: HASH
value = "";
// Skip whitespaces
do {
c = getCharacter();
} while (isspace(c) && !feof(mwcCfgFile));
// Process end of file
if (feof(mwcCfgFile))
return TOKEN_EOF;
// Try to parse an INT or a FLOAT
if (isdigit(c)) {
// gather all digits for an INT or a HEX
do {
// Add character to value
value += c;
c = getCharacter();
// Detect the switch to a hex digit
if (value.length()==1 && tolower(c)=='x')
hexInteger = true;
} while (isdigit(c) || (hexInteger && isxdigit(c)) || (hexInteger && value.length()==1 && tolower(c)=='x'));
// Stop right here if this is an hex number
if (hexInteger) {
if (!feof(mwcCfgFile))
ungetCharacter(c);
return TOKEN_HEX;
}
// If followed by a period, then it's considered a FLOAT, otherwise an INT
if (c != '.') {
if (!feof(mwcCfgFile))
ungetCharacter(c);
return TOKEN_INT;
}
// read in the decimals of the FLOAT
do {
// Add character to value
value += c;
// Get next character
c = getCharacter();
} while (isdigit(c));
if (!feof(mwcCfgFile))
ungetCharacter(c);
return TOKEN_FLOAT;
// Try to parse an IDENT
} else if (isalpha(c)) {
do {
// Add character to value
value += c;
// Get next character
c = getCharacter();
} while (isalpha(c) || isdigit(c) || c=='_' || c=='#');
if (!feof(mwcCfgFile))
ungetCharacter(c);
return TOKEN_IDENT;
// Try to parse a HASH
} else if (c == '#') {
// Add character to value
value += c;
return TOKEN_HASH;
}
// Unknown character
return TOKEN_UNKNOWN;
}
char *tokenName(Token t)
{
switch(t) {
case TOKEN_INT:
return "integer";
case TOKEN_HEX:
return "hex number";
case TOKEN_IDENT:
return "identifier";
case TOKEN_HASH:
return "'#'";
case TOKEN_FLOAT:
return "floating point number";
case TOKEN_EOF:
return "end-of-file";
}
return "UNKNOWN";
}
void match(Token t)
{
if (lookahead == t)
lookahead = getToken();
else {
errorReason = string("Expected ") + string(tokenName(t)) + string(", received '") + value + string("'");
throw currentLine;
}
}
/*
Grammar for MWC.CFG files:
mwccfgfile : version totals mcids
version: HASH IDENT FLOAT // must be #MWC 1.0
totals: INT INT // total number of MCID, total number of CMAC
mcids : mcid | mcid mcids
mcid : INT INT cmacs // MCID ident, number of CMACs in action
cmacs : cmac | cmac cmacs
cmac : IDENT IDENT INT INT HEX HEX INT // CMAC type, CMAC ident, repeat count, synchronous flag, reserved, reserved, reserved
*/
int CMAC()
{
match(TOKEN_IDENT);
match(TOKEN_IDENT);
match(TOKEN_INT);
match(TOKEN_INT);
match(TOKEN_HEX);
match(TOKEN_HEX);
match(TOKEN_INT);
return 0;
}
int CMACs()
{
CMAC();
if (lookahead == TOKEN_IDENT)
CMACs();
return 0;
}
int MCID()
{
match(TOKEN_INT);
match(TOKEN_INT);
CMACs();
return 0;
}
int MCIDs()
{
MCID();
if (lookahead == TOKEN_INT)
MCIDs();
return 0;
}
int totals()
{
match(TOKEN_INT);
match(TOKEN_INT);
return 0;
}
int version()
{
match(TOKEN_HASH);
// lookahead must be IDENT == "MWC"
if (lookahead!=TOKEN_IDENT || value!="MWC") {
errorReason = string("Expected 'MWC', received '") + value + string("'");
throw currentLine;
}
match(TOKEN_IDENT);
// lookahead must be FLOAT == "1.0"
if (lookahead!=TOKEN_FLOAT || value!="1.0") {
errorReason = string("Expected '1.0', received '") + value + string("'");
throw currentLine;
}
match(TOKEN_FLOAT);
return 0;
}
int MWCCFGFile()
{
version();
totals();
MCIDs();
return 0;
}
int parseMWCCFGFile(char *fileName, string &errorMessage)
{
openFile(fileName);
// Parse the file
try {
lookahead = getToken();
MWCCFGFile();
} catch(int lineWithError) {
char temp[11];
itoa(lineWithError, temp, 10);
errorMessage = string("Parse error around line ") + string(temp) + string(": ") + errorReason + string(".");
return lineWithError;
}
closeFile();
return 0;
}