Skip to content

Commit

Permalink
add antlr test case, corpus, results
Browse files Browse the repository at this point in the history
  • Loading branch information
parrt committed Apr 8, 2016
1 parent e224ba7 commit e09a422
Show file tree
Hide file tree
Showing 7 changed files with 2,416 additions and 96 deletions.
331 changes: 331 additions & 0 deletions corpus/antlr4/samples/ANTLRv4Lexer.g4
Original file line number Diff line number Diff line change
@@ -0,0 +1,331 @@
/*
* [The "BSD license"]
* Copyright (c) 2014 Terence Parr
* Copyright (c) 2014 Sam Harwell
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** A grammar for ANTLR v4 tokens */
lexer grammar ANTLRv4Lexer;

// Token names declared up front. In this file TOKEN_REF and RULE_REF are
// also produced by the lexer rules of the same names, and LEXER_CHAR_SET by
// the rule of that name in mode LexerCharSet. The embedded Java code in
// @members compares against TOKEN_REF and RULE_REF.
tokens {
TOKEN_REF,
RULE_REF,
LEXER_CHAR_SET
}

@members {
    /**
     * State used to tell [..arg actions..] apart from [charsets].
     *
     * _currentRuleType records which kind of rule the lexer is currently
     * inside: TOKEN_REF for a lexer rule, RULE_REF for a parser rule, and
     * Token.INVALID_TYPE when outside of any rule. The first rule-name or
     * token-name reference seen while outside of a rule marks the start of
     * a parser rule or a lexer rule respectively. The terminating ';' of a
     * rule flips the state back to Token.INVALID_TYPE.
     *
     * This is not perfect logic but works. For example, "grammar T;" means
     * that we start and stop a lexer rule for the "T;".
     *
     * Char sets can only occur in lexer rules, where arg actions cannot occur.
     */
    private int _currentRuleType = Token.INVALID_TYPE;

    /** Returns the kind of rule currently being lexed (see _currentRuleType). */
    public int getCurrentRuleType() {
        return _currentRuleType;
    }

    /** Overrides the kind of rule currently being lexed. */
    public void setCurrentRuleType(int ruleType) {
        this._currentRuleType = ruleType;
    }

    /**
     * Called when '[' is matched in the default mode. Switches to the
     * char-set mode inside a lexer rule and to the arg-action mode
     * otherwise, then continues the current token with more().
     */
    protected void handleBeginArgAction() {
        int targetMode = inLexerRule() ? LexerCharSet : ArgAction;
        pushMode(targetMode);
        more();
    }

    /**
     * Updates the rule-tracking state from the type of the token about to
     * be emitted, then delegates to the default emit().
     */
    @Override
    public Token emit() {
        final boolean isRuleName = (_type == TOKEN_REF || _type == RULE_REF);
        if (_type == SEMI) {
            // exit rule def
            _currentRuleType = Token.INVALID_TYPE;
        }
        else if (isRuleName && _currentRuleType == Token.INVALID_TYPE) {
            // outside of a rule def: now inside a lexer or parser rule
            _currentRuleType = _type;
        }
        return super.emit();
    }

    /** True while inside a rule that started with a TOKEN_REF. */
    private boolean inLexerRule() {
        return TOKEN_REF == _currentRuleType;
    }

    /** True while inside a rule that started with a RULE_REF (not used, kept for clarity). */
    private boolean inParserRule() {
        return RULE_REF == _currentRuleType;
    }
}

// Documentation comment; stays on the default channel. An unterminated
// comment is accepted up to EOF.
DOC_COMMENT
    :   '/**' .*? ( '*/' | EOF )
    ;

// Ordinary block comment, routed to the hidden channel. An unterminated
// comment is accepted up to EOF.
BLOCK_COMMENT
    :   '/*' .*? ( '*/' | EOF )
        -> channel(HIDDEN)
    ;

// Comment running to the end of the line, routed to the hidden channel.
LINE_COMMENT
    :   '//' ~( '\r' | '\n' )*
        -> channel(HIDDEN)
    ;

// An opening bracket starts either an argument action or a lexer char set;
// the decision is made in handleBeginArgAction() from @members.
BEGIN_ARG_ACTION
    :   '['
        { handleBeginArgAction(); }
    ;

// OPTIONS, TOKENS and CHANNELS each consume the opening brace of their
// block together with the keyword. This is the easiest way to keep such a
// block apart from an ACTION block, which uses the same {} delimiters.
OPTIONS
    :   'options' ( ' ' | '\t' | '\f' | '\n' | '\r' )* '{'
    ;

TOKENS
    :   'tokens' ( ' ' | '\t' | '\f' | '\n' | '\r' )* '{'
    ;

CHANNELS
    :   'channels' ( ' ' | '\t' | '\f' | '\n' | '\r' )* '{'
    ;

// Keywords. These rules appear before RULE_REF in the file, so a word such
// as 'import' that both rules match with equal length is reported as the
// keyword token.
IMPORT : 'import' ;
FRAGMENT : 'fragment' ;
LEXER : 'lexer' ;
PARSER : 'parser' ;
GRAMMAR : 'grammar' ;
PROTECTED : 'protected' ;
PUBLIC : 'public' ;
PRIVATE : 'private' ;
RETURNS : 'returns' ;
LOCALS : 'locals' ;
THROWS : 'throws' ;
CATCH : 'catch' ;
FINALLY : 'finally' ;
MODE : 'mode' ;

// Punctuation and operators. Several literals are prefixes of longer ones
// (':' and '::', '+' and '+=', '.' and '..').
// SEMI is also inspected by emit() in @members to detect the end of a rule.
COLON : ':' ;
COLONCOLON : '::' ;
COMMA : ',' ;
SEMI : ';' ;
LPAREN : '(' ;
RPAREN : ')' ;
RARROW : '->' ;
LT : '<' ;
GT : '>' ;
ASSIGN : '=' ;
QUESTION : '?' ;
STAR : '*' ;
PLUS : '+' ;
PLUS_ASSIGN : '+=' ;
OR : '|' ;
DOLLAR : '$' ;
DOT : '.' ;
RANGE : '..' ;
AT : '@' ;
POUND : '#' ;
NOT : '~' ;
RBRACE : '}' ;

// The Unicode-aware identifier rule is disabled:
//ID : NameStartChar NameChar*;
// It is replaced by the two ASCII-only rules below to allow testing ANTLR
// grammars in the IntelliJ preview. A name starting with a lowercase letter
// is a RULE_REF; a name starting with an uppercase letter is a TOKEN_REF.
RULE_REF
    :   'a'..'z' ( 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' )*
    ;

TOKEN_REF
    :   'A'..'Z' ( 'a'..'z' | 'A'..'Z' | '_' | '0'..'9' )*
    ;


// Characters allowed after the first character of a name: every name-start
// character plus digits, underscore and a few combining/connector ranges.
fragment NameChar
    :   NameStartChar
    |   [0-9]
    |   '_'
    |   '\u00B7'
    |   [\u0300-\u036F]
    |   [\u203F-\u2040]
    ;

// Characters allowed as the first character of a name.
// Code points above U+FFFF (U+10000 to U+EFFFF) are ignored.
fragment NameStartChar
    :   [A-Z]
    |   [a-z]
    |   [\u00C0-\u00D6]
    |   [\u00D8-\u00F6]
    |   [\u00F8-\u02FF]
    |   [\u0370-\u037D]
    |   [\u037F-\u1FFF]
    |   [\u200C-\u200D]
    |   [\u2070-\u218F]
    |   [\u2C00-\u2FEF]
    |   [\u3001-\uD7FF]
    |   [\uF900-\uFDCF]
    |   [\uFDF0-\uFFFD]
    ;

// One or more decimal digits.
INT
    :   ( '0'..'9' )+
    ;

// ANTLR makes no distinction between a single character literal and a
// multi-character string. Every literal is delimited by single quotes and
// may contain escape sequences (see ESC_SEQ), including Unicode escapes of
// the form \uxxxx where each x is a hexadecimal digit (as in Java,
// basically). A literal may not contain a raw line break.
STRING_LITERAL
    :   '\''
        ( ESC_SEQ | ~( '\'' | '\r' | '\n' | '\\' ) )*
        '\''
    ;

// Same as STRING_LITERAL but without the closing quote, so that a literal
// cut off by a line break or the end of input still yields a token.
UNTERMINATED_STRING_LITERAL
    :   '\''
        ( ESC_SEQ | ~( '\'' | '\r' | '\n' | '\\' ) )*
    ;
// Any escaped character that may be embedded in an ANTLR literal string:
// a backslash followed by one of the alternatives below.
fragment ESC_SEQ
    :   '\\'
        (   [btnfr"'\\]     // standard escapes such as tab, newline, etc.
        |   UNICODE_ESC     // Java-style Unicode escape
        |   .               // invalid escape
        |   EOF             // invalid escape at end of file
        )
    ;

// The letter u followed by up to four hexadecimal digits; fewer than four
// digits are tolerated here.
fragment UNICODE_ESC
    :   'u'
        ( HEX_DIGIT
            ( HEX_DIGIT
                ( HEX_DIGIT HEX_DIGIT? )?
            )?
        )?
    ;

// A single hexadecimal digit, in either case.
fragment HEX_DIGIT
    :   '0'..'9'
    |   'a'..'f'
    |   'A'..'F'
    ;

// Runs of whitespace go to the hidden channel.
WS
    :   ( ' ' | '\t' | '\r' | '\n' | '\f' )+
        -> channel(HIDDEN)
    ;

// Many language targets use {} as block delimiters and so we
// must recursively match {} delimited blocks to balance the
// braces. Additionally, we must make some assumptions about
// literal string representation in the target language. We assume
// that they are delimited by ' or " and so consume these
// in their own alts so as not to inadvertently match {}.
//
// The loop body is non-greedy and the rule also ends at EOF, so an
// unterminated action still produces an ACTION token. Block comments inside
// an action must be terminated to be matched by the comment alternative.

ACTION
: '{'
( ACTION
| ACTION_ESCAPE
| ACTION_STRING_LITERAL
| ACTION_CHAR_LITERAL
| '/*' .*? '*/' // ('*/' | EOF)
| '//' ~[\r\n]*
| .
)*?
('}'|EOF)
;

// A backslash followed by any single character.
fragment
ACTION_ESCAPE
: '\\' .
;

// Double-quoted target-language string; may contain escapes.
fragment
ACTION_STRING_LITERAL
: '"' (ACTION_ESCAPE | ~["\\])* '"'
;
// Single-quoted target-language literal; may contain escapes.
fragment
ACTION_CHAR_LITERAL
: '\'' (ACTION_ESCAPE | ~['\\])* '\''
;

// -----------------
// Illegal Character
//
// This is an illegal character trap. It is the last rule of the default
// mode (only the mode sections follow it). It matches a single character of
// any value and, being last, only wins when no other default-mode rule knows
// what to do about the character. The token is routed to the HIDDEN channel,
// so it is not passed on to the parser. This means that the parser has to
// deal with the grammar file anyway, but we will not try to analyse or
// generate code from a file with lexical errors.
// NOTE(review): the original comment says the character "is reported as an
// error"; no reporting is visible in this rule - confirm where it happens.
//
ERRCHAR
: . -> channel(HIDDEN)
;

// Mode entered from handleBeginArgAction() when '[' starts an argument
// action, e.g. [int x, List<String> a[]]
mode ArgAction;

// A nested opening bracket pushes this mode again and keeps accumulating
// text into the current token.
NESTED_ARG_ACTION
    :   '['
        -> more, pushMode(ArgAction)
    ;

// A backslash followed by any single character.
ARG_ACTION_ESCAPE
    :   '\\' .
        -> more
    ;

// A double-quoted string, consumed as a unit so that brackets inside it are
// not interpreted.
ARG_ACTION_STRING_LITERAL
    :   ( '"' ( '\\' . | ~( '"' | '\\' ) )* '"' )
        -> more
    ;
// A single-quoted character literal, consumed as a unit so that a bracket
// inside it (for example the literal ']') does not end the argument action.
// The earlier form of this rule used double quotes and had misplaced
// grouping, so it never matched a real character literal. Raw line breaks
// are excluded so that a stray apostrophe cannot swallow following lines; in
// that case the apostrophe falls through to ARG_ACTION_CHAR.
ARG_ACTION_CHAR_LITERAL
    :   ( '\'' ( '\\' . | ~['\\\r\n] )* '\'' )
        -> more
    ;

// A closing bracket leaves one level of ArgAction mode. NESTED_ARG_ACTION
// pushes this mode once per nested '[', so only the bracket that returns the
// lexer to a different mode completes the ARG_ACTION token. While the mode
// is still ArgAction after the pop, the bracket closed a nested pair and the
// token keeps accumulating via more(). Without this check an input such as
// [int a[]] was split at the first ']'.
ARG_ACTION
    :   ']'
        {
            popMode();
            if (_mode == ArgAction) {
                more();
            }
        }
    ;

// End of input inside an argument action. This rule was added to return a
// non-EOF token type here, because EOF did something weird.
UNTERMINATED_ARG_ACTION
    :   EOF
        -> popMode
    ;

// Any other character is accumulated into the current token.
// Must be the last rule of this mode.
ARG_ACTION_CHAR
    :   .
        -> more
    ;


// Mode entered from handleBeginArgAction() when '[' starts a character set
// inside a lexer rule.
mode LexerCharSet;

// Any character other than the closing bracket or a backslash, or a
// backslash followed by any character; accumulated into the current token.
LEXER_CHAR_SET_BODY
    :   (   ~( ']' | '\\' )
        |   '\\' .
        )
        -> more
    ;

// The closing bracket completes the char-set token and leaves this mode.
LEXER_CHAR_SET
    :   ']'
        -> popMode
    ;

// End of input inside a char set: emit a distinct token and leave this mode.
UNTERMINATED_CHAR_SET
    :   EOF
        -> popMode
    ;
Loading

0 comments on commit e09a422

Please sign in to comment.