Coverage Report - org.galagosearch.core.retrieval.query.StructuredQuery
 
Classes in this File Line Coverage Branch Coverage Complexity
StructuredQuery
90%
156/173
75%
86/114
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 package org.galagosearch.core.retrieval.query;
 3  
 
 4  
 import java.util.ArrayList;
 5  
 import java.util.HashSet;
 6  
 import java.util.List;
 7  
 import java.util.Set;
 8  
 import org.galagosearch.tupleflow.Parameters;
 9  
 
 10  
 /**
 11  
  * Valid query language syntax:
 12  
  *
 13  
  * #operator:argument(...)
 14  
  * term, or term.field, or term.field.field, etc.
 15  
  *
 16  
  * @author trevor
 17  
  */
 18  4
 public class StructuredQuery {
 19  
     /**
 20  
      * Copies a query tree using a traversal object.
 21  
      *
 22  
      * Both traversal.beforeNode and traversal.afterNode will be called
 23  
      * for every Node object in the query tree.
 24  
      * The traversal.beforeNode method will be called with a parent node
 25  
      * before any of its children (pre-order), while traversal.afterNode method
 26  
      * will be called on the parent after all of its children (post-order).
 27  
      *
 28  
      * The afterNode method will be called with a new copy of the original
 29  
      * node, with the children replaced by new copies.  afterNode can either
 30  
      * return its parameter or a modified node.
 31  
      */
 32  
     public static Node copy(Traversal traversal, Node tree) throws Exception {
 33  84
         ArrayList<Node> children = new ArrayList<Node>();
 34  84
         traversal.beforeNode(tree);
 35  
 
 36  84
         for (Node n : tree.getInternalNodes()) {
 37  60
             Node child = copy(traversal, n);
 38  60
             children.add(child);
 39  60
         }
 40  
 
 41  84
         Node newNode = new Node(tree.getOperator(), tree.getParameters(),
 42  
                 children, tree.getPosition());
 43  84
         return traversal.afterNode(newNode);
 44  
     }
 45  
 
 46  
     /**
 47  
      * Walks a query tree with a traversal object.
 48  
      *
 49  
      * Both traversal.beforeNode and traversal.afterNode will be called
 50  
      * for every Node object in the query tree.
 51  
      * The traversal.beforeNode method will be called with a parent node
 52  
      * before any of its children (pre-order), while traversal.afterNode method
 53  
      * will be called on the parent after all of its children (post-order).
 54  
      */
 55  
     public static void walk(Traversal traversal, Node tree) throws Exception {
 56  16
         traversal.beforeNode(tree);
 57  16
         for (Node n : tree.getInternalNodes()) {
 58  12
             walk(traversal, n);
 59  
         }
 60  16
         traversal.afterNode(tree);
 61  16
     }
 62  
 
 63  
     /**
 64  
      * Splits an input string, which may include escapes,
 65  
      * into chunks based on a delimiter character.  It does not split
 66  
      * on delimiters that appear in escaped regions.
 67  
      */
 68  
     public static ArrayList<String> splitOn(String text, char delimiter) {
 69  48
         int start = 0;
 70  48
         ArrayList<String> result = new ArrayList<String>();
 71  
 
 72  280
         for (int i = 0; i < text.length(); i++) {
 73  232
             char c = text.charAt(i);
 74  
 
 75  232
             if (c == '@') {
 76  0
                 i = StructuredQuery.findEscapedEnd(text, i);
 77  232
             } else if (c == delimiter) {
 78  4
                 result.add(text.substring(start, i));
 79  4
                 start = i + 1;
 80  
             }
 81  
         }
 82  
 
 83  48
         if (start != text.length()) {
 84  48
             result.add(text.substring(start));
 85  
         }
 86  
 
 87  48
         return result;
 88  
     }
 89  
 
 90  
     /**
 91  
      * <p>Parses an operator from the string query.  This method assumes that
 92  
      * operator is a pound sign ('#') followed by some text, followed by an open 
 93  
      * parentheses.  Parsing stops at the parenthesis.</p>
 94  
      *
 95  
      * <p>Note that parsing starts at index 0, not at "offset".  The offset is used purely
 96  
      * for giving parse error information, and represents the offset of the operator string
 97  
      * in the larger query string.</p>
 98  
      */
 99  
     public static Node parseOperator(String operator, int offset) {
 100  44
         assert operator.charAt(0) == '#';
 101  44
         int firstParen = operator.indexOf('(');
 102  
 
 103  44
         if (firstParen < 0) {
 104  0
             return new Node("unknown", new ArrayList<Node>(), offset);
 105  
         }
 106  
 
 107  44
         String operatorText = operator.substring(0, firstParen);
 108  44
         ArrayList<String> operatorParts = splitOn(operatorText.substring(1), ':');
 109  
 
 110  44
         String operatorName = operatorParts.get(0);
 111  44
         Parameters parameters = new Parameters();
 112  
 
 113  44
         for (String part : operatorParts.subList(1, operatorParts.size())) {
 114  4
             ArrayList<String> keyValue = splitOn(part, '=');
 115  
 
 116  4
             if (keyValue.size() == 1) {
 117  4
                 parameters.add("default", decodeEscapes(part));
 118  0
             } else if (keyValue.size() > 1) {
 119  0
                 String key = keyValue.get(0);
 120  0
                 String value = keyValue.get(1);
 121  0
                 parameters.add(decodeEscapes(key), decodeEscapes(value));
 122  
             }
 123  4
         }
 124  
 
 125  44
         int endOperator = operator.length();
 126  
 
 127  44
         if (operator.charAt(operator.length() - 1) == ')') {
 128  44
             endOperator--;
 129  
         }
 130  
 
 131  44
         ArrayList<Node> children = parseArguments(operator.substring(firstParen + 1, endOperator),
 132  
                                                   offset + firstParen + 1);
 133  44
         Node result = new Node(operatorName, parameters, children, offset);
 134  44
         return result;
 135  
     }
 136  
 
 137  
     /**
 138  
      * Find the end of an escaped query region.
 139  
      * We assume that query.charAt(start) == '@'.  The
 140  
      * next charater is the boundary symbol for the escaped text.
 141  
      * We move forward in the string until we see the boundary symbol again.
 142  
      * If the escaped region isn't well formed (that is, there is no 
 143  
      * initial boundary symbol, or we only see the boundary symbol once),
 144  
      * query.length() is returned.
 145  
      */
 146  
     public static int findEscapedEnd(String query, int start) {
 147  32
         assert query.charAt(start) == '@';
 148  
 
 149  
         // guard against the error case
 150  32
         if (query.length() < start + 2) {
 151  0
             return query.length();
 152  
         }
 153  32
         char doneChar = query.charAt(start + 1);
 154  32
         int result = query.indexOf(doneChar, start + 2);
 155  
 
 156  32
         if (result < 0) {
 157  0
             return query.length();
 158  
         }
 159  32
         return result;
 160  
     }
 161  
 
 162  
     /**
 163  
      * <p>Find the end of an operator.  We assume that query.charAt(start)
 164  
      * is a pound sign.  We skip forward in the query looking for
 165  
      * the end of the operator by looking at parentheses; we know we're
 166  
      * done when the parentheses are balanced and we've seen at least
 167  
      * one open parenthesis.  This method skips over escaped regions.</p>
 168  
      */
 169  
     public static int findOperatorEnd(String query, int start) {
 170  88
         int i = start;
 171  88
         int open = 0;
 172  88
         int closed = 0;
 173  
 
 174  2776
         for (; i < query.length() && (open != closed || open == 0); i++) {
 175  1344
             char current = query.charAt(i);
 176  
 
 177  1344
             if (current == '(') {
 178  96
                 open++;
 179  1248
             } else if (current == ')') {
 180  96
                 closed++;
 181  1152
             } else if (current == '@') {
 182  16
                 i = findEscapedEnd(query, i);
 183  
             }
 184  
         }
 185  
 
 186  88
         return i;
 187  
     }
 188  
 
 189  
     public static int findTextEnd(String query, int start) {
 190  184
         int i = start;
 191  
 
 192  840
         for (; i < query.length(); i++) {
 193  396
             char current = query.charAt(i);
 194  
 
 195  396
             if (Character.isWhitespace(current)) {
 196  68
                 break;
 197  328
             } else if (current == '@') {
 198  8
                 i = findEscapedEnd(query, i);
 199  
             }
 200  
         }
 201  
 
 202  184
         return i;
 203  
     }
 204  
 
 205  
     public static String decodeEscapes(String escapedString) {
 206  100
         StringBuilder builder = new StringBuilder();
 207  100
         char escapeChar = ' ';
 208  100
         boolean inEscape = false;
 209  
 
 210  288
         for (int i = 0; i < escapedString.length(); i++) {
 211  188
             char current = escapedString.charAt(i);
 212  
 
 213  188
             if (!inEscape && current == '@' && i + 1 < escapedString.length()) {
 214  8
                 escapeChar = escapedString.charAt(i + 1);
 215  8
                 inEscape = true;
 216  8
                 i += 1;
 217  180
             } else if (inEscape && current == escapeChar) {
 218  8
                 inEscape = false;
 219  
             } else {
 220  172
                 builder.append(escapedString.charAt(i));
 221  
             }
 222  
         }
 223  
 
 224  100
         return builder.toString();
 225  
     }
 226  
 
 227  
     public static ArrayList<String> splitStringRespectingEscapes(String query, char split) {
 228  156
         ArrayList<String> chunks = new ArrayList<String>();
 229  156
         int start = 0;
 230  
 
 231  536
         for (int i = 0; i < query.length(); i++) {
 232  380
             char current = query.charAt(i);
 233  
 
 234  380
             if (current == split) {
 235  52
                 chunks.add(query.substring(start, i));
 236  52
                 start = i + 1;
 237  328
             } else if (current == '@') {
 238  8
                 i = findEscapedEnd(query, i);
 239  
             }
 240  
         }
 241  
 
 242  156
         if (start < query.length() || query.length() == 0) {
 243  156
             chunks.add(query.substring(start));
 244  
         }
 245  
 
 246  156
         return chunks;
 247  
     }
 248  
     
 249  
     public static Node fieldOrNode(ArrayList<String> fieldNames, int offset) {
 250  52
         assert fieldNames.size() > 0 : "Can't make an or node with no fields";
 251  
         Node result;
 252  
         
 253  52
         if (fieldNames.size() == 1) {
 254  44
             result = new Node("field", fieldNames.get(0), offset);
 255  
         } else {
 256  8
             ArrayList<Node> children = new ArrayList<Node>();
 257  
             
 258  24
             for (int i = 0; i < fieldNames.size(); ++i) {
 259  16
                 children.add(new Node("field", fieldNames.get(i), offset));
 260  
             }
 261  
             
 262  8
             result = new Node("extentor", children, offset);
 263  
         }
 264  
         
 265  52
         return result;
 266  
     }
 267  
 
 268  
     public static Node parseTerm(String query, int offset) {
 269  
         // step 1, split at periods
 270  96
         ArrayList<String> chunks = splitStringRespectingEscapes(query, '.');
 271  
         
 272  
         // step 2, decode each chunk.  The first chunk is the term text,
 273  
         // while all remaining chunks describe fields.
 274  96
         Node result = new Node("text", decodeEscapes(chunks.get(0)), offset);
 275  96
         result = parseFields(result, chunks.subList(1, chunks.size()), offset);
 276  
 
 277  96
         return result;
 278  
     }
 279  
 
 280  
     public static int findOperatorExpressionEnd(String query, int offset) {
 281  
         // this is an operator, so scan for balanced parentheses,
 282  
         // not including escaped regions
 283  44
         int end = findOperatorEnd(query, offset);
 284  44
         int textEnd = findTextEnd(query, end);
 285  
 
 286  44
         if (end != textEnd && query.charAt(end) == '.')
 287  8
             return textEnd;
 288  
 
 289  36
         return end;
 290  
     }
 291  
 
 292  
     /**
 293  
      * Parses arguments to an operator.  This method can also be used to parse
 294  
      * the elements of a query, since terms in a query are implicitly in a #combine
 295  
      * operator.
 296  
      * 
 297  
      * @param query The query, or a substring of it, that contains argument text.
 298  
      * @param offset The offset into the full query string (offset == 0 if query is the full query).
 299  
      * @return a List of Nodes, which represent the query.
 300  
      */
 301  
     public static ArrayList<Node> parseArguments(String query, int offset) {
 302  96
         ArrayList<Node> arguments = new ArrayList<Node>();
 303  96
         int start = 0;
 304  96
         boolean inElement = false;
 305  
 
 306  
         // scan the string, looking for tokens.  Tokens are either operators (#\w+(...)), words, or escaped.
 307  244
         for (int i = 0; i < query.length(); i++) {
 308  148
             char current = query.charAt(i);
 309  
 
 310  148
             if (!Character.isWhitespace(current)) {
 311  140
                 if (current == '#') {
 312  44
                     Node child = parseOperatorExpression(query, i, offset);
 313  44
                     arguments.add(child);
 314  44
                     i = findOperatorExpressionEnd(query, i);
 315  44
                 } else {
 316  96
                     int end = findTextEnd(query, i);
 317  96
                     Node child = parseTerm(query.substring(i, end), offset + i);
 318  96
                     arguments.add(child);
 319  96
                     i = end;
 320  
                 }
 321  
             }
 322  
         }
 323  
 
 324  
         // we're at the end of the string
 325  96
         if (inElement) {
 326  0
             Node child = new Node("text", query.substring(start), offset + start);
 327  0
             arguments.add(child);
 328  
         }
 329  
 
 330  96
         return arguments;
 331  
     }
 332  
 
 333  
     public static Node parse(String query) {
 334  52
         ArrayList<Node> arguments = parseArguments(query, 0);
 335  
 
 336  52
         if (query.length() == 0) {
 337  0
             return new Node("text", "");
 338  52
         } else if (arguments.size() == 1) {
 339  48
             return arguments.get(0);
 340  
         } else {
 341  4
             return new Node("combine", arguments, 0);
 342  
         }
 343  
     }
 344  
 
 345  
     public static Set<String> findQueryTerms(Node queryTree) {
 346  0
         HashSet<String> queryTerms = new HashSet<String>();
 347  
 
 348  0
         if (queryTree.getOperator().equals("text")) {
 349  0
             queryTerms.add(queryTree.getDefaultParameter());
 350  
         } else {
 351  0
             for (Node child : queryTree.getInternalNodes()) {
 352  0
                 queryTerms.addAll(findQueryTerms(child));
 353  
             }
 354  
         }
 355  
 
 356  0
         return queryTerms;
 357  
     }
 358  
 
 359  
     private static Node parseFields(Node result, List<String> chunks, int offset) {
 360  156
         for (int i = 0; i < chunks.size(); i++) {
 361  52
             ArrayList<Node> children = new ArrayList<Node>();
 362  52
             children.add(result);
 363  52
             String fieldText = chunks.get(i);
 364  52
             boolean isSmoothingField = false;
 365  52
             if (fieldText.startsWith("(") && fieldText.endsWith(")")) {
 366  12
                 isSmoothingField = true;
 367  12
                 fieldText = fieldText.substring(1, fieldText.length() - 1);
 368  
             }
 369  52
             ArrayList<String> fieldNames = splitStringRespectingEscapes(fieldText, ',');
 370  52
             Node fieldNode = fieldOrNode(fieldNames, offset);
 371  52
             children.add(fieldNode);
 372  52
             if (isSmoothingField) {
 373  12
                 result = new Node("smoothinside", children, offset);
 374  
             } else {
 375  40
                 result = new Node("inside", children, offset);
 376  
             }
 377  
         }
 378  104
         return result;
 379  
     }
 380  
 
 381  
     private static Node parseOperatorExpression(String query, int i, int offset) {
 382  
         // this is an operator, so scan for balanced parentheses,
 383  
         // not including escaped regions
 384  44
         int end = findOperatorEnd(query, i);
 385  44
         Node child = parseOperator(query.substring(i, end), offset + i);
 386  
         // operators may end with a field expression, so parse that too:
 387  44
         int textEnd = findTextEnd(query, end);
 388  44
         if (textEnd != end && query.charAt(end) == '.') {
 389  8
             String remainingText = query.substring(end + 1, textEnd);
 390  8
             ArrayList<String> chunks = splitStringRespectingEscapes(remainingText, '.');
 391  8
             child = parseFields(child, chunks, end);
 392  
         }
 393  44
         return child;
 394  
     }
 395  
 }