View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.retrieval.query;
3   
4   import java.util.ArrayList;
5   import java.util.HashSet;
6   import java.util.List;
7   import java.util.Set;
8   import org.galagosearch.tupleflow.Parameters;
9   
10  /***
11   * Valid query language syntax:
12   *
13   * #operator:argument(...)
14   * term, or term.field, or term.field.field, etc.
15   *
16   * @author trevor
17   */
18  public class StructuredQuery {
19      /***
20       * Copies a query tree using a traversal object.
21       *
22       * Both traversal.beforeNode and traversal.afterNode will be called
23       * for every Node object in the query tree.
24       * The traversal.beforeNode method will be called with a parent node
25       * before any of its children (pre-order), while traversal.afterNode method
26       * will be called on the parent after all of its children (post-order).
27       *
28       * The afterNode method will be called with a new copy of the original
29       * node, with the children replaced by new copies.  afterNode can either
30       * return its parameter or a modified node.
31       */
32      public static Node copy(Traversal traversal, Node tree) throws Exception {
33          ArrayList<Node> children = new ArrayList<Node>();
34          traversal.beforeNode(tree);
35  
36          for (Node n : tree.getInternalNodes()) {
37              Node child = copy(traversal, n);
38              children.add(child);
39          }
40  
41          Node newNode = new Node(tree.getOperator(), tree.getParameters(),
42                  children, tree.getPosition());
43          return traversal.afterNode(newNode);
44      }
45  
46      /***
47       * Walks a query tree with a traversal object.
48       *
49       * Both traversal.beforeNode and traversal.afterNode will be called
50       * for every Node object in the query tree.
51       * The traversal.beforeNode method will be called with a parent node
52       * before any of its children (pre-order), while traversal.afterNode method
53       * will be called on the parent after all of its children (post-order).
54       */
55      public static void walk(Traversal traversal, Node tree) throws Exception {
56          traversal.beforeNode(tree);
57          for (Node n : tree.getInternalNodes()) {
58              walk(traversal, n);
59          }
60          traversal.afterNode(tree);
61      }
62  
63      /***
64       * Splits an input string, which may include escapes,
65       * into chunks based on a delimiter character.  It does not split
66       * on delimiters that appear in escaped regions.
67       */
68      public static ArrayList<String> splitOn(String text, char delimiter) {
69          int start = 0;
70          ArrayList<String> result = new ArrayList<String>();
71  
72          for (int i = 0; i < text.length(); i++) {
73              char c = text.charAt(i);
74  
75              if (c == '@') {
76                  i = StructuredQuery.findEscapedEnd(text, i);
77              } else if (c == delimiter) {
78                  result.add(text.substring(start, i));
79                  start = i + 1;
80              }
81          }
82  
83          if (start != text.length()) {
84              result.add(text.substring(start));
85          }
86  
87          return result;
88      }
89  
90      /***
91       * <p>Parses an operator from the string query.  This method assumes that
92       * operator is a pound sign ('#') followed by some text, followed by an open 
93       * parentheses.  Parsing stops at the parenthesis.</p>
94       *
95       * <p>Note that parsing starts at index 0, not at "offset".  The offset is used purely
96       * for giving parse error information, and represents the offset of the operator string
97       * in the larger query string.</p>
98       */
99      public static Node parseOperator(String operator, int offset) {
100         assert operator.charAt(0) == '#';
101         int firstParen = operator.indexOf('(');
102 
103         if (firstParen < 0) {
104             return new Node("unknown", new ArrayList<Node>(), offset);
105         }
106 
107         String operatorText = operator.substring(0, firstParen);
108         ArrayList<String> operatorParts = splitOn(operatorText.substring(1), ':');
109 
110         String operatorName = operatorParts.get(0);
111         Parameters parameters = new Parameters();
112 
113         for (String part : operatorParts.subList(1, operatorParts.size())) {
114             ArrayList<String> keyValue = splitOn(part, '=');
115 
116             if (keyValue.size() == 1) {
117                 parameters.add("default", decodeEscapes(part));
118             } else if (keyValue.size() > 1) {
119                 String key = keyValue.get(0);
120                 String value = keyValue.get(1);
121                 parameters.add(decodeEscapes(key), decodeEscapes(value));
122             }
123         }
124 
125         int endOperator = operator.length();
126 
127         if (operator.charAt(operator.length() - 1) == ')') {
128             endOperator--;
129         }
130 
131         ArrayList<Node> children = parseArguments(operator.substring(firstParen + 1, endOperator),
132                                                   offset + firstParen + 1);
133         Node result = new Node(operatorName, parameters, children, offset);
134         return result;
135     }
136 
137     /***
138      * Find the end of an escaped query region.
139      * We assume that query.charAt(start) == '@'.  The
140      * next charater is the boundary symbol for the escaped text.
141      * We move forward in the string until we see the boundary symbol again.
142      * If the escaped region isn't well formed (that is, there is no 
143      * initial boundary symbol, or we only see the boundary symbol once),
144      * query.length() is returned.
145      */
146     public static int findEscapedEnd(String query, int start) {
147         assert query.charAt(start) == '@';
148 
149         // guard against the error case
150         if (query.length() < start + 2) {
151             return query.length();
152         }
153         char doneChar = query.charAt(start + 1);
154         int result = query.indexOf(doneChar, start + 2);
155 
156         if (result < 0) {
157             return query.length();
158         }
159         return result;
160     }
161 
162     /***
163      * <p>Find the end of an operator.  We assume that query.charAt(start)
164      * is a pound sign.  We skip forward in the query looking for
165      * the end of the operator by looking at parentheses; we know we're
166      * done when the parentheses are balanced and we've seen at least
167      * one open parenthesis.  This method skips over escaped regions.</p>
168      */
169     public static int findOperatorEnd(String query, int start) {
170         int i = start;
171         int open = 0;
172         int closed = 0;
173 
174         for (; i < query.length() && (open != closed || open == 0); i++) {
175             char current = query.charAt(i);
176 
177             if (current == '(') {
178                 open++;
179             } else if (current == ')') {
180                 closed++;
181             } else if (current == '@') {
182                 i = findEscapedEnd(query, i);
183             }
184         }
185 
186         return i;
187     }
188 
189     public static int findTextEnd(String query, int start) {
190         int i = start;
191 
192         for (; i < query.length(); i++) {
193             char current = query.charAt(i);
194 
195             if (Character.isWhitespace(current)) {
196                 break;
197             } else if (current == '@') {
198                 i = findEscapedEnd(query, i);
199             }
200         }
201 
202         return i;
203     }
204 
205     public static String decodeEscapes(String escapedString) {
206         StringBuilder builder = new StringBuilder();
207         char escapeChar = ' ';
208         boolean inEscape = false;
209 
210         for (int i = 0; i < escapedString.length(); i++) {
211             char current = escapedString.charAt(i);
212 
213             if (!inEscape && current == '@' && i + 1 < escapedString.length()) {
214                 escapeChar = escapedString.charAt(i + 1);
215                 inEscape = true;
216                 i += 1;
217             } else if (inEscape && current == escapeChar) {
218                 inEscape = false;
219             } else {
220                 builder.append(escapedString.charAt(i));
221             }
222         }
223 
224         return builder.toString();
225     }
226 
227     public static ArrayList<String> splitStringRespectingEscapes(String query, char split) {
228         ArrayList<String> chunks = new ArrayList<String>();
229         int start = 0;
230 
231         for (int i = 0; i < query.length(); i++) {
232             char current = query.charAt(i);
233 
234             if (current == split) {
235                 chunks.add(query.substring(start, i));
236                 start = i + 1;
237             } else if (current == '@') {
238                 i = findEscapedEnd(query, i);
239             }
240         }
241 
242         if (start < query.length() || query.length() == 0) {
243             chunks.add(query.substring(start));
244         }
245 
246         return chunks;
247     }
248     
249     public static Node fieldOrNode(ArrayList<String> fieldNames, int offset) {
250         assert fieldNames.size() > 0 : "Can't make an or node with no fields";
251         Node result;
252         
253         if (fieldNames.size() == 1) {
254             result = new Node("field", fieldNames.get(0), offset);
255         } else {
256             ArrayList<Node> children = new ArrayList<Node>();
257             
258             for (int i = 0; i < fieldNames.size(); ++i) {
259                 children.add(new Node("field", fieldNames.get(i), offset));
260             }
261             
262             result = new Node("extentor", children, offset);
263         }
264         
265         return result;
266     }
267 
268     public static Node parseTerm(String query, int offset) {
269         // step 1, split at periods
270         ArrayList<String> chunks = splitStringRespectingEscapes(query, '.');
271         
272         // step 2, decode each chunk.  The first chunk is the term text,
273         // while all remaining chunks describe fields.
274         Node result = new Node("text", decodeEscapes(chunks.get(0)), offset);
275         result = parseFields(result, chunks.subList(1, chunks.size()), offset);
276 
277         return result;
278     }
279 
280     public static int findOperatorExpressionEnd(String query, int offset) {
281         // this is an operator, so scan for balanced parentheses,
282         // not including escaped regions
283         int end = findOperatorEnd(query, offset);
284         int textEnd = findTextEnd(query, end);
285 
286         if (end != textEnd && query.charAt(end) == '.')
287             return textEnd;
288 
289         return end;
290     }
291 
292     /***
293      * Parses arguments to an operator.  This method can also be used to parse
294      * the elements of a query, since terms in a query are implicitly in a #combine
295      * operator.
296      * 
297      * @param query The query, or a substring of it, that contains argument text.
298      * @param offset The offset into the full query string (offset == 0 if query is the full query).
299      * @return a List of Nodes, which represent the query.
300      */
301     public static ArrayList<Node> parseArguments(String query, int offset) {
302         ArrayList<Node> arguments = new ArrayList<Node>();
303         int start = 0;
304         boolean inElement = false;
305 
306         // scan the string, looking for tokens.  Tokens are either operators (#\w+(...)), words, or escaped.
307         for (int i = 0; i < query.length(); i++) {
308             char current = query.charAt(i);
309 
310             if (!Character.isWhitespace(current)) {
311                 if (current == '#') {
312                     Node child = parseOperatorExpression(query, i, offset);
313                     arguments.add(child);
314                     i = findOperatorExpressionEnd(query, i);
315                 } else {
316                     int end = findTextEnd(query, i);
317                     Node child = parseTerm(query.substring(i, end), offset + i);
318                     arguments.add(child);
319                     i = end;
320                 }
321             }
322         }
323 
324         // we're at the end of the string
325         if (inElement) {
326             Node child = new Node("text", query.substring(start), offset + start);
327             arguments.add(child);
328         }
329 
330         return arguments;
331     }
332 
333     public static Node parse(String query) {
334         ArrayList<Node> arguments = parseArguments(query, 0);
335 
336         if (query.length() == 0) {
337             return new Node("text", "");
338         } else if (arguments.size() == 1) {
339             return arguments.get(0);
340         } else {
341             return new Node("combine", arguments, 0);
342         }
343     }
344 
345     public static Set<String> findQueryTerms(Node queryTree) {
346         HashSet<String> queryTerms = new HashSet<String>();
347 
348         if (queryTree.getOperator().equals("text")) {
349             queryTerms.add(queryTree.getDefaultParameter());
350         } else {
351             for (Node child : queryTree.getInternalNodes()) {
352                 queryTerms.addAll(findQueryTerms(child));
353             }
354         }
355 
356         return queryTerms;
357     }
358 
359     private static Node parseFields(Node result, List<String> chunks, int offset) {
360         for (int i = 0; i < chunks.size(); i++) {
361             ArrayList<Node> children = new ArrayList<Node>();
362             children.add(result);
363             String fieldText = chunks.get(i);
364             boolean isSmoothingField = false;
365             if (fieldText.startsWith("(") && fieldText.endsWith(")")) {
366                 isSmoothingField = true;
367                 fieldText = fieldText.substring(1, fieldText.length() - 1);
368             }
369             ArrayList<String> fieldNames = splitStringRespectingEscapes(fieldText, ',');
370             Node fieldNode = fieldOrNode(fieldNames, offset);
371             children.add(fieldNode);
372             if (isSmoothingField) {
373                 result = new Node("smoothinside", children, offset);
374             } else {
375                 result = new Node("inside", children, offset);
376             }
377         }
378         return result;
379     }
380 
381     private static Node parseOperatorExpression(String query, int i, int offset) {
382         // this is an operator, so scan for balanced parentheses,
383         // not including escaped regions
384         int end = findOperatorEnd(query, i);
385         Node child = parseOperator(query.substring(i, end), offset + i);
386         // operators may end with a field expression, so parse that too:
387         int textEnd = findTextEnd(query, end);
388         if (textEnd != end && query.charAt(end) == '.') {
389             String remainingText = query.substring(end + 1, textEnd);
390             ArrayList<String> chunks = splitStringRespectingEscapes(remainingText, '.');
391             child = parseFields(child, chunks, end);
392         }
393         return child;
394     }
395 }