A framework for parsing program source files.
Parsing source file implies a context-free grammar. Such a grammar describes how an addition is written, a function call, etc.. in the source file. Here, a grammar is described as a PEG (parsing expression grammar) instead of the classical Backus-Naur Form (BNF). The improved formalism of PEG imports some features of regular expressions so that :
Grammar_.h provides a framework for encoding grammar rules in PEG and interpreting them. The root grammar rule can be repeatedly applied to a stream of terminal tokens (from Lexer_.h), in order to convert it into a stream of syntactic trees (where an addition is a 'composite token' with 2 'child tokens'). The parser uses a simplified backrat algorithm to match grammar rules : during backtracking, only the latest, partial pattern match is saved (instead of all pattern matches).
PEG Grammar - identical to the BNF grammar in this case :
root::= (variable | function)*
variable ::= 'int'? identifier ';'
function ::= identifier '()' '{' variable* '}'
Input :
int var;
;
foo(){var;var;}
foo2(){;}
Output :
[token 2 declares int var] This may be customized
minimalFile:5 - syntax error : ';' found instead of <anIdentifier>, 'int'
[token 5 declares foo()]
[token 8 declares var]
[token 10 declares var]
[<{<noToken>}>] These are the curly braces that need no particularly handling
[token 13 declares foo2()]
minimalFile:17 - syntax error after '{' : ';' found instead of <anIdentifier>, 'int', '}'
minimalFile:17 - syntax error : ';' found instead of <anIdentifier>, 'int', '}'
minimalFile:18 - syntax error : '}' found instead of <anIdentifier>, 'int'
Note : tokens are counted instead of lines in this minimal implementation.
| BNF | declaration | allocation |
| a::= ';' | GrammarTerminal a( &semicolonToken); | THE( semicolonToken) |
| a::= <identifier> | const char* idString= "anIdentifier"; | A_( idString) |
| a::= <any token> | SINGLE_TOKEN | |
| a::= b c | GrammarSequence a( &b, &c, 0); | SEQUENCE( &b, &c, 0) |
| flush output from within a GrammarSequence( .., 0) | HANDLE( aUniqueIntValue) | |
| a::= b | c | GrammarSwitch a( &b, &c, 0); | EITHER( &b, &c, 0) |
| a::= (b | c) d | GrammarSequence a( EITHER( &b, &c, 0), &d, 0); | SEQUENCE( EITHER( &b, &c, 0), &d, 0) |
| a::= b? | GrammarPossible a( &b); | MAYBE( &b) |
| a::= b* | GrammarLoop a( &b) GrammarLoop a( &b, false); | HANDLE_ANY( &b) ANY( &b) |
| //+ a::= b+ | MANY( &b) | |
| a::= !b | GrammarNot a( &b); | NOT( &b) |
| //+ a::= &b |
GrammarSwitch will be evaluated in given order (prioritized parsing). GrammarLoop consumes as much input as possible (greedy parsing).
Recommendation : within a GrammarSwitch( ..) or EITHER( ..), use only THE(), A_() and &aRule [ and no SEQUENCE(), EITHER(), etc.., so that the syntactic tree generated can easily be identified by its type_ field]
enum { declarationSyntax=1, functionSyntax }; // passed to pSink->handlePattern( int, SyntacticTree*);
TerminalToken int_( "int"),
semicolon_( ";"),
braces_( "()"),
leftBrace( "{"),
rightBrace( "}");
const char* identifier_= "anIdentifier"; // the string that appears in error reports
// implementation note : unique string shared by all A_( identifier_), for fast pattern matching
GrammarSequence variable_( MAYBE( THE( int_)), A_( identifier_), THE( semicolon_), HANDLE( declarationSyntax), 0);
GrammarSequence function_( A_( identifier_), THE( braces_), HANDLE( functionSyntax),
THE( leftBrace), HANDLE_ANY( &variable_), THE( rightBrace), 0);
GrammarLoop root_( EITHER( &variable_, &function_, 0));
// implementation note : '&' before 'variable_' is unavoidable
// because a variable parameter list cannot hold a reference like just 'variable_'
Operation :
class C_tokenFiles : TokenStream {}; // user-supplied lexical analyser, derived from Lexer_
class MinimalSink : SyntacticSink { // user-supplied handler
bool handlePattern( int i_, SyntacticTree* t_);
};
main() {
Token_::currentOut= stderr; // (or stdout) select destination of error messages
C_tokenFiles source_;
if (source_.set_( "path/fn.c") ==0)
printf( "cannot open file 'path/fn.c'\n");
MinimalSink sink_;
root_.parseFromTo( &source_, &sink_);
}
Syntactic trees reported :
minimalSink.handlePattern( declarationSyntax, SyntacticTree( variable_.type_, int_, VariableToken( identifier_, "var"), semicolon_, 0)) //c_tokenFile.printWarning(); minimalSink.handlePattern( functionSyntax, SyntacticTree( function_.type_, VariableToken( identifier_, "foo"), braces_, 0) minimalSink.handlePattern( declarationSyntax, SyntacticTree( variable_.type_, noToken, VariableToken( identifier_, "var"), semicolon_, 0)) minimalSink.handlePattern( declarationSyntax, SyntacticTree( variable_.type_, noToken, VariableToken( identifier_, "var"), semicolon_, 0)) //c_tokenFile.printWarning(); minimalSink.handle_( SyntacticTree( 0, leftBrace, SyntactiTree( 0, noToken, 0), rightBrace, 0); minimalSink.handlePattern( functionSyntax, SyntacticTree( function_.type_, VariableToken( identifier_, "foo"), braces_, 0) //c_tokenFile.printWarning(); //c_tokenFile.printWarning(); //c_tokenFile.printWarning();