Skip to content

Commit

Permalink
opLEM: reject wildcards with error message.
Browse files Browse the repository at this point in the history
  • Loading branch information
Bodmo committed Dec 12, 2024
1 parent 582abd8 commit 94b2fa6
Show file tree
Hide file tree
Showing 9 changed files with 232 additions and 108 deletions.
4 changes: 2 additions & 2 deletions doTestQuery
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
# 06.12.24/FB

# current Koral lib:
KJAR=./target/Koral-0.42.0.jar
KJAR=./target/Koral-0.44.0.jar
# Calling that main class:
MCL=de.ids_mannheim.korap.query.serialize.QuerySerializer
# Query Language:
QL="cosmas2"

Query="&Fes-&Mann"
Query="&F+&Mann+"
Query="ab /s0 &F+&Mann+"
Query="ab /s0 &?&+&*&Mann"

echo -e "Query = '$Query' in QL='$QL'..."

Expand Down
16 changes: 10 additions & 6 deletions src/main/antlr/cosmas/c2ps.g
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
// - more generally: a comma at the end of a searchword that is not enclosed by "..."
// is now excluded from the searchword.
// - a comma inside a searchword is accepted if enclosed by "...".
//
// 10.12.24/FB
// - skip wildcards [?*+] in lemma search expression, as regex/wildcards are not allowed
// in &opts&lemma, but instead they may appear as options in 'opts'.
// E.g. &F+&Prüfung -> lemma with F+ as an option.
// - test added for F+.
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

grammar c2ps;
Expand Down Expand Up @@ -236,8 +240,8 @@ searchExpr
-> $se1+ ;

searchExpr1
: op1 -> {$op1.tree}
| searchWord -> {$searchWord.tree}
: op1 -> {$op1.tree}
| searchWord -> {$searchWord.tree}
| searchLemma -> {$searchLemma.tree}
| searchAnnot -> {$searchAnnot.tree}
| searchLabel -> {$searchLabel.tree}
Expand All @@ -250,13 +254,13 @@ searchWord
: word1
| word2;

word1 : SEARCHWORD1 -> {c2ps_opWF.check($SEARCHWORD1.text, false, false, $SEARCHWORD1.index)} ;
word1 : SEARCHWORD1 -> {c2ps_opWF.check($SEARCHWORD1.text, false, false, $SEARCHWORD1.pos)} ;

word2 : SEARCHWORD2 -> {c2ps_opWF.check($SEARCHWORD2.text, true, false, $SEARCHWORD2.index)} ;
word2 : SEARCHWORD2 -> {c2ps_opWF.check($SEARCHWORD2.text, true, false, $SEARCHWORD2.pos)} ;

// Suchbegriff = Lemma:
searchLemma
: SEARCHLEMMA -> {c2ps_opWF.check($SEARCHLEMMA.text, false, true, $SEARCHLEMMA.index)} ;
: SEARCHLEMMA -> {c2ps_opWF.check($SEARCHLEMMA.text, false, true, $SEARCHLEMMA.pos)} ;

// Suchbegriff = Annotationsoperator:
// (damit Lexer den richtige Token erzeugt, muss OP_ELEM den gesamten
Expand Down
18 changes: 9 additions & 9 deletions src/main/antlr/cosmas/c2ps_opWF.g
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,14 @@ WS : (' ')+ {skip();};
//
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

searchWFs
: searchWF+;
searchWFs[int pos]
: searchWF[pos]+;

searchWF: optCase? wordform tpos?
searchWF[int pos] : optCase? wordform[OPWF,pos] tpos?

-> ^(OPWF wordform optCase? tpos? ) ;

wordform: WF -> {c2ps_opWF.encode($WF.text, OPWF)};
wordform[int type, int pos]: WF -> {c2ps_opWF.encode($WF.text, $type, $pos)};

// Case Options:
optCase : Case
Expand All @@ -67,8 +67,8 @@ tpos : TPos
-> ^(TPOS {c2ps_opBED.checkTPos($TPos.text, $TPos.index)});

// analog für Lemmata, kein optCase:
searchLEM
: wordform tpos?
-> ^(OPLEM wordform tpos?);
// todo: check wordform (=lemma) for wildcards, which are not allowed in the lemma expr.

searchLEM[int pos]
: wordform[OPLEM,pos] tpos?
-> ^(OPLEM wordform tpos?);
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public class c2ps_opPROX
public static final int MLANG_GERMAN = 1;

public static int
messLang = MLANG_ENGLISH; // default.
messLang = MLANG_GERMAN; // default.

// type of an Error CommonToken:
final static int
Expand Down Expand Up @@ -53,9 +53,9 @@ private static String getErrMessEN(int errCode, String text)

case StatusCodes.ERR_PROX_WRONG_CHARS:
return String.format("Proximity operator at '%s': unknown proximity options!", text);
case StatusCodes.UNKNOWN_QUERY_ERROR:
return String.format("Unknown error!");

case StatusCodes.ERR_LEM_WILDCARDS:
return String.format("Lemma operator at '%s': wildcards (?*+) are not allowed inside a lemma.", text);

default:
return String.format("Proximity operator at '%s': unknown error. The correct syntax looks like this: E.g. ' /+w2 ' or ' /w10,s0 '.", text);
Expand Down Expand Up @@ -86,33 +86,15 @@ private static String getErrMessGE(int errCode, String text)
case StatusCodes.ERR_PROX_WRONG_CHARS:
return String.format("Abstandsoperator an der Stelle '%s': unbekannte Abstandsoption(en)!", text);

case StatusCodes.UNKNOWN_QUERY_ERROR:
return String.format("Unbekannter Fehler!");
case StatusCodes.ERR_LEM_WILDCARDS:
return String.format("Lemma-Suchbegriff an der Stelle '%s': Platzhalter (?*+) können im gesuchten Lemma nicht eingesetzt werden.", text);

default:
return String.format("Abstandsoperator an der Stelle '%s': unbekannter Fehler. Korrekte Syntax z.B.: ' /+w2 ' oder ' /w10,s0 '.", text);
}
}

private static String getErrMess(int errCode, int messLang, String text)

{
if( messLang == c2ps_opPROX.MLANG_GERMAN )
return getErrMessGE(errCode, text);
else
return getErrMessEN(errCode, text);
}


/**
* in this version, the pre-stored message language is used.
* @param errCode
* @param text
* @return
* 10.06.24/FB
*/

public static String getErrMess(int errCode, String text)
public static String getErrMess(int errCode, int messLang, String text)

{
if( messLang == c2ps_opPROX.MLANG_GERMAN )
Expand Down Expand Up @@ -183,7 +165,7 @@ public static Object encodeDIST(int typeDIST, int typeDIR, Object ctDir, Object
CommonTree tree3 = (CommonTree)ctVal;

if( bDebug )
System.err.printf("Debug: encodeDIST: scanned input='%s' countM=%d countD=%d countV=%d pos=%d.\n",
System.out.printf("Debug: encodeDIST: scanned input='%s' countM=%d countD=%d countV=%d pos=%d.\n",
text, countM, countD, countV, pos);

if( countM == 0 )
Expand All @@ -203,7 +185,7 @@ public static Object encodeDIST(int typeDIST, int typeDIR, Object ctDir, Object
treeDIR.addChild(treeBOTH);

if( bDebug )
System.err.printf("Debug: encodeDIST: tree for DIR: '%s'.\n", treeDIR.toStringTree());
System.out.printf("Debug: encodeDIST: tree for DIR: '%s'.\n", treeDIR.toStringTree());
tree1 = treeDIR;
}
else if( countD > 1 )
Expand All @@ -218,7 +200,7 @@ else if( countD > 1 )
tree.addChild(tree2);

if( bDebug )
System.err.printf("Debug: encodeDIST: returning '%s'.\n", tree.toStringTree());
System.out.printf("Debug: encodeDIST: returning '%s'.\n", tree.toStringTree());

return tree;
} // encodeDIST
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
public class c2ps_opREG

{
private static boolean DEBUG = false;
private static boolean DEBUG = true;

/*
* encode():
Expand Down
133 changes: 117 additions & 16 deletions src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opWF.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import org.antlr.runtime.*;
import org.antlr.runtime.tree.*;
import de.ids_mannheim.korap.query.parse.cosmas.c2ps_opPROXLexer;
import de.ids_mannheim.korap.query.parse.cosmas.c2ps_opPROX;
import de.ids_mannheim.korap.query.serialize.util.StatusCodes;

/*
* parses prefixed and suffixed options of a search wordform.
Expand All @@ -11,6 +14,8 @@
public class c2ps_opWF

{
//static final int OPWF = 5; // must have the same value as OPWF in c2ps_opWF.g
//static final int OPLEM = 7; // must have the same value as OPLEM in c2ps_opWF.g
/* Arguments:
* bStrip: true: 'input' contains "wort" -> strip " away -> wort.
* false: 'input' contains no " -> nothing to strip.
Expand All @@ -19,9 +24,16 @@ public class c2ps_opWF
* input: may be a single Lemma or Wform or a list of Wforms.
*/

public static Tree check (String input, boolean bStrip, boolean bLem,
int index) {
if (bStrip)
public static Tree check (String input, boolean bStrip, boolean bLem, int pos)
{
if( bLem )
{
System.out.printf("c2ps_opWF.check: input='%s' bStrip=%b bLem=%b pos=%d.\n",
input, bStrip, bLem, pos);
System.out.flush();
}

if (bStrip)
input = input.substring(1, input.length() - 1);

if (bLem && input.charAt(0) == '&') {
Expand All @@ -43,18 +55,23 @@ public static Tree check (String input, boolean bStrip, boolean bLem,

try {
if (bLem)
c2PQLEMReturn = g.searchLEM();
c2PQLEMReturn = g.searchLEM(pos);
else
c2PQWFReturn = g.searchWFs();
c2PQWFReturn = g.searchWFs(pos);
}
catch (RecognitionException e) {
e.printStackTrace();
}

// AST Tree anzeigen:
Tree tree = bLem ? (Tree) c2PQLEMReturn.getTree() : (Tree) c2PQWFReturn
.getTree();
// System.out.println(bLem? "opLEM: " : "opWF: " + tree.toStringTree() );
Tree tree = bLem ? (Tree)c2PQLEMReturn.getTree() : (Tree)c2PQWFReturn.getTree();

if( bLem )
{
System.out.printf("c2ps_opWF.check: %s: '%s'.\n", bLem ? "opLEM" : "opWF",
tree.toStringTree() );
System.out.flush();
}

return tree;
}
Expand All @@ -63,24 +80,108 @@ public static Tree check (String input, boolean bStrip, boolean bLem,
/* Wordform Encoding, e.g. to insert a Wordform into an AST.
* a) wf -> "wf".
* b) remove escape char before ':': abc\: -> abc:.
* Returns a Tree.
* Args:
* wf : wordform or lemma (expected lemma : "lemma" or "opts&lemma",
* the starting '&' has been removed before entering this function).
* tokenType : either OPWF or OPLEM.
* pos : start position of wf.
* Notes:
* - &opts&lemma : may contain wildcards as options in the &opts& section only.
* reject if wildcards appear in the &lemma section.
* Returns a Tree or an ErrorTree.
*/
public static Tree encode (String wf, int tokenType)
public static Tree encode (String wf, int tokenType, int pos)

{
//System.out.printf("c2ps_opWF.encode: wf='%s' tokenType=%d pos=%d.\n", wf, tokenType, pos);

// b)
StringBuffer sbWF = new StringBuffer(wf);

for (int i = 0; i < sbWF.length() - 1; i++) {
for (int i = 0; i < sbWF.length()-1; i++)
{
if (sbWF.charAt(i) == '\\' && sbWF.charAt(i + 1) == ':')
sbWF.deleteCharAt(i);
}

return new CommonTree(new CommonToken(tokenType, "\"" + sbWF.toString()
+ "\""));
}

// Lemma expressions only: scan the (already unescaped) text for the wildcard characters ? * +.
// Wildcards that are followed by a '&' are forgotten again, so only a wildcard
// located after the last '&' (or anywhere, if there is no '&') triggers the error.
if( tokenType == c2ps_opWFLexer.OPLEM )
{
boolean hasOpts = false; // set to true when a '&' occurs, e.g. "Fes+C&lemma"; not read within this block.
boolean hasFound = false; // true if a wildcard was seen after the last '&' (or anywhere, if there is no '&'); false otherwise.

for(int i=0; i< sbWF.length(); i++)
{
if( sbWF.charAt(i) == '&' )
{
hasOpts = true;
// every wildcard seen so far stands to the left of this '&': discard it.
hasFound = false;
}
else if (sbWF.charAt(i) == '?' || sbWF.charAt(i) == '*' || sbWF.charAt(i) == '+' )
{
hasFound = true;
}
}

// a wildcard remained after the last '&': report it and return an error tree instead of a lemma node.
if( hasFound )
{
System.out.printf("c2ps_opWF.encode: Syntax error: '%s' contains wildcards inside lemma expression!\n", wf);
return buildErrorTree(wf, StatusCodes.ERR_LEM_WILDCARDS, pos);
}
}

return new CommonTree(new CommonToken(tokenType, "\"" + sbWF.toString() + "\""));
}


/**
 * Builds the error subtree that is returned in place of a regular AST node.
 *
 * The result is a root node labelled "ERROR" with exactly three children, in this
 * order: position, error code, error message. Every node is a CommonTree wrapping
 * a CommonToken of type c2ps_opPROX.typeERROR.
 *
 * @param text    part of the query that contains the error; handed to
 *                c2ps_opPROX.getErrMess() to build the message.
 * @param errCode error code; selects the message and is stored as the text of the 2nd child.
 * @param pos     position value supplied by the caller; stored as the text of the 1st child.
 * @return the "ERROR" root node with its three children attached.
 */
private static CommonTree buildErrorTree(String text, int errCode, int pos)
{
    // The message is produced in the language currently stored in c2ps_opPROX.messLang.
    String message = c2ps_opPROX.getErrMess(errCode, c2ps_opPROX.messLang, text);

    CommonTree root = new CommonTree(new CommonToken(c2ps_opPROX.typeERROR, "ERROR"));

    // Child order matters to consumers of the tree: position, code, message.
    String[] childTexts = { String.valueOf(pos), String.valueOf(errCode), message };

    for (String childText : childTexts)
    {
        root.addChild(new CommonTree(new CommonToken(c2ps_opPROX.typeERROR, childText)));
    }

    return root;
}

/*
* main testprogram:
*/
Expand Down
Loading

0 comments on commit 94b2fa6

Please sign in to comment.