X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Fmain%2Fjava%2Forg%2Fz3950%2Fzing%2Fcql%2FCQLParser.java;h=8a0fc172592a572a6cf25e9802009a59414d5905;hb=738438f3da478a1d0121cbbdab620c47f222c73e;hp=7580959cf328718a342e0c9e3df2fff87e8b6c3d;hpb=7d8ec6eaa2b799a2c466605af660e381644c48cc;p=cql-java-moved-to-github.git diff --git a/src/main/java/org/z3950/zing/cql/CQLParser.java b/src/main/java/org/z3950/zing/cql/CQLParser.java index 7580959..8a0fc17 100644 --- a/src/main/java/org/z3950/zing/cql/CQLParser.java +++ b/src/main/java/org/z3950/zing/cql/CQLParser.java @@ -1,27 +1,33 @@ -// $Id: CQLParser.java,v 1.39 2007-08-06 15:54:48 mike Exp $ package org.z3950.zing.cql; + +import java.io.BufferedReader; import java.io.IOException; -import java.util.Vector; import java.util.Properties; import java.io.InputStream; import java.io.FileInputStream; -import java.io.FileNotFoundException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; /** * Compiles CQL strings into parse trees of CQLNode subtypes. 
* - * @version $Id: CQLParser.java,v 1.39 2007-08-06 15:54:48 mike Exp $ * @see http://zing.z3950.org/cql/index.html */ public class CQLParser { - private CQLLexer lexer; - private int compat; // When false, implement CQL 1.2 - public static int V1POINT1 = 12368; - public static int V1POINT2 = 12369; - public static int V1POINT1SORT = 12370; + private CQLTokenizer lexer; + private final int compat; // When false, implement CQL 1.2 + private final Set customRelations = new HashSet(); + + public static final int V1POINT1 = 12368; + public static final int V1POINT2 = 12369; + public static final int V1POINT1SORT = 12370; + public final boolean allowKeywordTerms; static private boolean DEBUG = false; static private boolean LEXDEBUG = false; @@ -41,19 +47,52 @@ public class CQLParser { */ public CQLParser(int compat) { this.compat = compat; + this.allowKeywordTerms = true; } - + + /** + * Official CQL grammar allows registered keywords like 'and/or/not/sortby/prox' + * to be used unquoted in terms. This constructor allows to create an instance + * of a parser that prohibits this behavior while sacrificing compatibility. + * @param compat CQL version compatibility + * @param allowKeywordTerms when false registered keywords are disallowed in unquoted terms + */ + public CQLParser(int compat, boolean allowKeywordTerms) { + this.compat = compat; + this.allowKeywordTerms = allowKeywordTerms; + } + /** * The new parser implements CQL 1.2 */ public CQLParser() { this.compat = V1POINT2; + this.allowKeywordTerms = true; } private static void debug(String str) { if (DEBUG) System.err.println("PARSEDEBUG: " + str); } + + /** + * Registers custom relation in this parser. Note that when a custom relation + * is registered the parser is no longer strictly compliant with the chosen spec. 
+ * @param relation the relation name to register with this parser + * @return true if custom relation has not been registered already + */ + public boolean registerCustomRelation(String relation) { + return customRelations.add(relation); + } + + /** + * Unregisters previously registered custom relation in this instance of the parser. + * @param relation the relation name to remove from this parser + * @return true if the relation had been previously registered + */ + public boolean unregisterCustomRelation(String relation) { + return customRelations.remove(relation); + } /** * Compiles a CQL query. @@ -75,12 +114,13 @@ public class CQLParser { throws CQLParseException, IOException { lexer = new CQLLexer(cql, LEXDEBUG); - lexer.nextToken(); + lexer.move(); debug("about to parseQuery()"); CQLNode root = parseTopLevelPrefixes("cql.serverChoice", new CQLRelation(compat == V1POINT2 ? "=" : "scr")); - if (lexer.ttype != lexer.TT_EOF) - throw new CQLParseException("junk after end: " + lexer.render()); + if (lexer.what() != CQLTokenizer.TT_EOF) + throw new CQLParseException("junk after end: " + lexer.render(), + lexer.pos()); return root; } @@ -89,25 +129,25 @@ public class CQLParser { throws CQLParseException, IOException { debug("top-level prefix mapping"); - if (lexer.ttype == '>') { + if (lexer.what() == '>') { return parsePrefix(index, relation, true); } CQLNode node = parseQuery(index, relation); if ((compat == V1POINT2 || compat == V1POINT1SORT) && - lexer.ttype == lexer.TT_SORTBY) { - match(lexer.ttype); + lexer.what() == CQLTokenizer.TT_SORTBY) { + match(lexer.what()); debug("sortspec"); CQLSortNode sortnode = new CQLSortNode(node); - while (lexer.ttype != lexer.TT_EOF) { + while (lexer.what() != CQLTokenizer.TT_EOF) { String sortindex = matchSymbol("sort index"); ModifierSet ms = gatherModifiers(sortindex); sortnode.addSortIndex(ms); } if (sortnode.keys.size() == 0) { - throw new CQLParseException("no sort keys"); + throw new CQLParseException("no sort keys", lexer.pos()); } node = sortnode; @@ -121,25 +161,25 @@ public class CQLParser { debug("in 
parseQuery()"); CQLNode term = parseTerm(index, relation); - while (lexer.ttype != lexer.TT_EOF && - lexer.ttype != ')' && - lexer.ttype != lexer.TT_SORTBY) { - if (lexer.ttype == lexer.TT_AND || - lexer.ttype == lexer.TT_OR || - lexer.ttype == lexer.TT_NOT || - lexer.ttype == lexer.TT_PROX) { - int type = lexer.ttype; - String val = lexer.sval; + while (lexer.what() != CQLTokenizer.TT_EOF && + lexer.what() != ')' && + lexer.what() != CQLTokenizer.TT_SORTBY) { + if (lexer.what() == CQLTokenizer.TT_AND || + lexer.what() == CQLTokenizer.TT_OR || + lexer.what() == CQLTokenizer.TT_NOT || + lexer.what() == CQLTokenizer.TT_PROX) { + int type = lexer.what(); + String val = lexer.value(); match(type); ModifierSet ms = gatherModifiers(val); CQLNode term2 = parseTerm(index, relation); - term = ((type == lexer.TT_AND) ? new CQLAndNode(term, term2, ms) : - (type == lexer.TT_OR) ? new CQLOrNode (term, term2, ms) : - (type == lexer.TT_NOT) ? new CQLNotNode(term, term2, ms) : + term = ((type == CQLTokenizer.TT_AND) ? new CQLAndNode(term, term2, ms) : + (type == CQLTokenizer.TT_OR) ? new CQLOrNode (term, term2, ms) : + (type == CQLTokenizer.TT_NOT) ? 
new CQLNotNode(term, term2, ms) : new CQLProxNode(term, term2, ms)); } else { throw new CQLParseException("expected boolean, got " + - lexer.render()); + lexer.render(), lexer.pos()); } } @@ -152,20 +192,21 @@ public class CQLParser { debug("in gatherModifiers()"); ModifierSet ms = new ModifierSet(base); - while (lexer.ttype == '/') { + while (lexer.what() == '/') { match('/'); - if (lexer.ttype != lexer.TT_WORD) + if (lexer.what() != CQLTokenizer.TT_WORD) throw new CQLParseException("expected modifier, " - + "got " + lexer.render()); - String type = lexer.sval.toLowerCase(); - match(lexer.ttype); + + "got " + lexer.render(), + lexer.pos()); + String type = lexer.value().toLowerCase(); + match(lexer.what()); if (!isSymbolicRelation()) { // It's a simple modifier consisting of type only ms.addModifier(type); } else { // It's a complex modifier of the form type=value - String comparision = lexer.render(lexer.ttype, false); - match(lexer.ttype); + String comparision = lexer.render(lexer.what(), false); + match(lexer.what()); String value = matchSymbol("modifier value"); ms.addModifier(type, comparision, value); } @@ -178,35 +219,47 @@ public class CQLParser { throws CQLParseException, IOException { debug("in parseTerm()"); - String word; + String first; + StringBuilder all; while (true) { - if (lexer.ttype == '(') { + if (lexer.what() == '(') { debug("parenthesised term"); match('('); CQLNode expr = parseQuery(index, relation); match(')'); return expr; - } else if (lexer.ttype == '>') { + } else if (lexer.what() == '>') { return parsePrefix(index, relation, false); } debug("non-parenthesised term"); - word = matchSymbol("index or term"); - if (!isSymbolicRelation() && lexer.ttype != lexer.TT_WORD) - break; - - index = word; - String relstr = (lexer.ttype == lexer.TT_WORD ? 
- lexer.sval : lexer.render(lexer.ttype, false)); + first = matchSymbol("index or term"); + all = new StringBuilder(first); + //match relation only on second position + while (isWordOrString() && (all.length() > first.length() || !isRelation())) { + all.append(" ").append(lexer.value()); + match(lexer.what()); + } + + if (!isRelation()) + break; //we're done if no relation + + //we have relation, but it only makes sense if preceded by a single term + if (all.length() > first.length()) { + throw new CQLParseException("unexpected relation '"+lexer.value()+"'" + , lexer.pos()); + } + index = first; + String relstr = (lexer.what() == CQLTokenizer.TT_WORD ? + lexer.value() : lexer.render(lexer.what(), false)); relation = new CQLRelation(relstr); - match(lexer.ttype); + match(lexer.what()); ModifierSet ms = gatherModifiers(relstr); - relation.setModifiers(ms); + relation.ms = ms; debug("index='" + index + ", " + "relation='" + relation.toCQL() + "'"); } - - CQLTermNode node = new CQLTermNode(index, relation, word); + CQLTermNode node = new CQLTermNode(index, relation, all.toString()); debug("made term node " + node.toCQL()); return node; } @@ -219,7 +272,7 @@ public class CQLParser { match('>'); String name = null; String identifier = matchSymbol("prefix-name"); - if (lexer.ttype == '=') { + if (lexer.what() == '=') { match('='); name = identifier; identifier = matchSymbol("prefix-identifer"); @@ -230,58 +283,76 @@ public class CQLParser { return new CQLPrefixNode(name, identifier, node); } + + private boolean isWordOrString() { + return CQLTokenizer.TT_WORD == lexer.what() + || CQLTokenizer.TT_STRING == lexer.what(); + } + + private boolean isRelation() { + debug("isRelation: checking what()=" + lexer.what() + + " (" + lexer.render() + ")"); + if (lexer.what() == CQLTokenizer.TT_WORD && + (lexer.value().indexOf('.') >= 0 || + lexer.value().equals("any") || + lexer.value().equals("all") || + lexer.value().equals("within") || + lexer.value().equals("encloses") || + 
(lexer.value().equals("exact") && compat != V1POINT2) || + (lexer.value().equals("scr") && compat != V1POINT2) || + (lexer.value().equals("adj") && compat == V1POINT2) || + customRelations.contains(lexer.value()))) + return true; + + return isSymbolicRelation(); + } - // Checks for a relation private boolean isSymbolicRelation() { - debug("isSymbolicRelation: checking ttype=" + lexer.ttype + + debug("isSymbolicRelation: checking what()=" + lexer.what() + " (" + lexer.render() + ")"); - return (lexer.ttype == '<' || - lexer.ttype == '>' || - lexer.ttype == '=' || - lexer.ttype == lexer.TT_LE || - lexer.ttype == lexer.TT_GE || - lexer.ttype == lexer.TT_NE || - lexer.ttype == lexer.TT_EQEQ); + return (lexer.what() == '<' || + lexer.what() == '>' || + lexer.what() == '=' || + lexer.what() == CQLTokenizer.TT_LE || + lexer.what() == CQLTokenizer.TT_GE || + lexer.what() == CQLTokenizer.TT_NE || + lexer.what() == CQLTokenizer.TT_EQEQ); } private void match(int token) throws CQLParseException, IOException { debug("in match(" + lexer.render(token, true) + ")"); - if (lexer.ttype != token) + if (lexer.what() != token) throw new CQLParseException("expected " + lexer.render(token, true) + - ", " + "got " + lexer.render()); - int tmp = lexer.nextToken(); - debug("match() got token=" + lexer.ttype + ", " + - "nval=" + lexer.nval + ", sval='" + lexer.sval + "'" + - " (tmp=" + tmp + ")"); + ", " + "got " + lexer.render(), + lexer.pos()); + lexer.move(); + debug("match() got token=" + lexer.what() + ", value()='" + lexer.value() + "'"); } private String matchSymbol(String expected) throws CQLParseException, IOException { debug("in matchSymbol()"); - if (lexer.ttype == lexer.TT_WORD || - lexer.ttype == lexer.TT_NUMBER || - lexer.ttype == '"' || + if (lexer.what() == CQLTokenizer.TT_WORD || + lexer.what() == CQLTokenizer.TT_STRING || // The following is a complete list of keywords. 
Because // they're listed here, they can be used unquoted as // indexes, terms, prefix names and prefix identifiers. - // ### Instead, we should ask the lexer whether what we - // have is a keyword, and let the knowledge reside there. - lexer.ttype == lexer.TT_AND || - lexer.ttype == lexer.TT_OR || - lexer.ttype == lexer.TT_NOT || - lexer.ttype == lexer.TT_PROX || - lexer.ttype == lexer.TT_SORTBY) { - String symbol = (lexer.ttype == lexer.TT_NUMBER) ? - lexer.render() : lexer.sval; - match(lexer.ttype); + (allowKeywordTerms && + (lexer.what() == CQLTokenizer.TT_AND || + lexer.what() == CQLTokenizer.TT_OR || + lexer.what() == CQLTokenizer.TT_NOT || + lexer.what() == CQLTokenizer.TT_PROX || + lexer.what() == CQLTokenizer.TT_SORTBY))) { + String symbol = lexer.value(); + match(lexer.what()); return symbol; } throw new CQLParseException("expected " + expected + ", " + - "got " + lexer.render()); + "got " + lexer.render(), lexer.pos()); } @@ -348,7 +419,7 @@ public class CQLParser { char mode = 'x'; // x=XCQL, c=CQL, p=PQF String pfile = null; - Vector argv = new Vector(); + List argv = new ArrayList(); for (int i = 0; i < args.length; i++) { argv.add(args[i]); } @@ -385,47 +456,59 @@ public class CQLParser { if (argv.size() == 1) { cql = (String) argv.get(0); } else { - byte[] bytes = new byte[10000]; + BufferedReader buff = new BufferedReader(new InputStreamReader(System.in)); try { - // Read in the whole of standard input in one go - int nbytes = System.in.read(bytes); + // read a single line of input + cql = buff.readLine(); + if (cql == null) { + System.err.println("Can't read query from stdin"); + System.exit(2); + return; + } } catch (IOException ex) { System.err.println("Can't read query: " + ex.getMessage()); System.exit(2); + return; } - cql = new String(bytes); } CQLParser parser = new CQLParser(compat); - CQLNode root = null; + CQLNode root; try { root = parser.parse(cql); } catch (CQLParseException ex) { System.err.println("Syntax error: " + ex.getMessage()); 
+ StringBuilder space = new StringBuilder(cql.length()); + System.out.println(cql); + for (int i=0; i