1
2 package org.galagosearch.core.retrieval.query;
3
4 import java.util.ArrayList;
5 import java.util.HashSet;
6 import java.util.List;
7 import java.util.Set;
8 import org.galagosearch.tupleflow.Parameters;
9
10 /***
11 * Valid query language syntax:
12 *
13 * #operator:argument(...)
14 * term, or term.field, or term.field.field, etc.
15 *
16 * @author trevor
17 */
18 public class StructuredQuery {
19 /***
20 * Copies a query tree using a traversal object.
21 *
22 * Both traversal.beforeNode and traversal.afterNode will be called
23 * for every Node object in the query tree.
24 * The traversal.beforeNode method will be called with a parent node
25 * before any of its children (pre-order), while traversal.afterNode method
26 * will be called on the parent after all of its children (post-order).
27 *
28 * The afterNode method will be called with a new copy of the original
29 * node, with the children replaced by new copies. afterNode can either
30 * return its parameter or a modified node.
31 */
32 public static Node copy(Traversal traversal, Node tree) throws Exception {
33 ArrayList<Node> children = new ArrayList<Node>();
34 traversal.beforeNode(tree);
35
36 for (Node n : tree.getInternalNodes()) {
37 Node child = copy(traversal, n);
38 children.add(child);
39 }
40
41 Node newNode = new Node(tree.getOperator(), tree.getParameters(),
42 children, tree.getPosition());
43 return traversal.afterNode(newNode);
44 }
45
46 /***
47 * Walks a query tree with a traversal object.
48 *
49 * Both traversal.beforeNode and traversal.afterNode will be called
50 * for every Node object in the query tree.
51 * The traversal.beforeNode method will be called with a parent node
52 * before any of its children (pre-order), while traversal.afterNode method
53 * will be called on the parent after all of its children (post-order).
54 */
55 public static void walk(Traversal traversal, Node tree) throws Exception {
56 traversal.beforeNode(tree);
57 for (Node n : tree.getInternalNodes()) {
58 walk(traversal, n);
59 }
60 traversal.afterNode(tree);
61 }
62
63 /***
64 * Splits an input string, which may include escapes,
65 * into chunks based on a delimiter character. It does not split
66 * on delimiters that appear in escaped regions.
67 */
68 public static ArrayList<String> splitOn(String text, char delimiter) {
69 int start = 0;
70 ArrayList<String> result = new ArrayList<String>();
71
72 for (int i = 0; i < text.length(); i++) {
73 char c = text.charAt(i);
74
75 if (c == '@') {
76 i = StructuredQuery.findEscapedEnd(text, i);
77 } else if (c == delimiter) {
78 result.add(text.substring(start, i));
79 start = i + 1;
80 }
81 }
82
83 if (start != text.length()) {
84 result.add(text.substring(start));
85 }
86
87 return result;
88 }
89
90 /***
91 * <p>Parses an operator from the string query. This method assumes that
92 * operator is a pound sign ('#') followed by some text, followed by an open
93 * parentheses. Parsing stops at the parenthesis.</p>
94 *
95 * <p>Note that parsing starts at index 0, not at "offset". The offset is used purely
96 * for giving parse error information, and represents the offset of the operator string
97 * in the larger query string.</p>
98 */
99 public static Node parseOperator(String operator, int offset) {
100 assert operator.charAt(0) == '#';
101 int firstParen = operator.indexOf('(');
102
103 if (firstParen < 0) {
104 return new Node("unknown", new ArrayList<Node>(), offset);
105 }
106
107 String operatorText = operator.substring(0, firstParen);
108 ArrayList<String> operatorParts = splitOn(operatorText.substring(1), ':');
109
110 String operatorName = operatorParts.get(0);
111 Parameters parameters = new Parameters();
112
113 for (String part : operatorParts.subList(1, operatorParts.size())) {
114 ArrayList<String> keyValue = splitOn(part, '=');
115
116 if (keyValue.size() == 1) {
117 parameters.add("default", decodeEscapes(part));
118 } else if (keyValue.size() > 1) {
119 String key = keyValue.get(0);
120 String value = keyValue.get(1);
121 parameters.add(decodeEscapes(key), decodeEscapes(value));
122 }
123 }
124
125 int endOperator = operator.length();
126
127 if (operator.charAt(operator.length() - 1) == ')') {
128 endOperator--;
129 }
130
131 ArrayList<Node> children = parseArguments(operator.substring(firstParen + 1, endOperator),
132 offset + firstParen + 1);
133 Node result = new Node(operatorName, parameters, children, offset);
134 return result;
135 }
136
137 /***
138 * Find the end of an escaped query region.
139 * We assume that query.charAt(start) == '@'. The
140 * next charater is the boundary symbol for the escaped text.
141 * We move forward in the string until we see the boundary symbol again.
142 * If the escaped region isn't well formed (that is, there is no
143 * initial boundary symbol, or we only see the boundary symbol once),
144 * query.length() is returned.
145 */
146 public static int findEscapedEnd(String query, int start) {
147 assert query.charAt(start) == '@';
148
149
150 if (query.length() < start + 2) {
151 return query.length();
152 }
153 char doneChar = query.charAt(start + 1);
154 int result = query.indexOf(doneChar, start + 2);
155
156 if (result < 0) {
157 return query.length();
158 }
159 return result;
160 }
161
162 /***
163 * <p>Find the end of an operator. We assume that query.charAt(start)
164 * is a pound sign. We skip forward in the query looking for
165 * the end of the operator by looking at parentheses; we know we're
166 * done when the parentheses are balanced and we've seen at least
167 * one open parenthesis. This method skips over escaped regions.</p>
168 */
169 public static int findOperatorEnd(String query, int start) {
170 int i = start;
171 int open = 0;
172 int closed = 0;
173
174 for (; i < query.length() && (open != closed || open == 0); i++) {
175 char current = query.charAt(i);
176
177 if (current == '(') {
178 open++;
179 } else if (current == ')') {
180 closed++;
181 } else if (current == '@') {
182 i = findEscapedEnd(query, i);
183 }
184 }
185
186 return i;
187 }
188
189 public static int findTextEnd(String query, int start) {
190 int i = start;
191
192 for (; i < query.length(); i++) {
193 char current = query.charAt(i);
194
195 if (Character.isWhitespace(current)) {
196 break;
197 } else if (current == '@') {
198 i = findEscapedEnd(query, i);
199 }
200 }
201
202 return i;
203 }
204
205 public static String decodeEscapes(String escapedString) {
206 StringBuilder builder = new StringBuilder();
207 char escapeChar = ' ';
208 boolean inEscape = false;
209
210 for (int i = 0; i < escapedString.length(); i++) {
211 char current = escapedString.charAt(i);
212
213 if (!inEscape && current == '@' && i + 1 < escapedString.length()) {
214 escapeChar = escapedString.charAt(i + 1);
215 inEscape = true;
216 i += 1;
217 } else if (inEscape && current == escapeChar) {
218 inEscape = false;
219 } else {
220 builder.append(escapedString.charAt(i));
221 }
222 }
223
224 return builder.toString();
225 }
226
227 public static ArrayList<String> splitStringRespectingEscapes(String query, char split) {
228 ArrayList<String> chunks = new ArrayList<String>();
229 int start = 0;
230
231 for (int i = 0; i < query.length(); i++) {
232 char current = query.charAt(i);
233
234 if (current == split) {
235 chunks.add(query.substring(start, i));
236 start = i + 1;
237 } else if (current == '@') {
238 i = findEscapedEnd(query, i);
239 }
240 }
241
242 if (start < query.length() || query.length() == 0) {
243 chunks.add(query.substring(start));
244 }
245
246 return chunks;
247 }
248
249 public static Node fieldOrNode(ArrayList<String> fieldNames, int offset) {
250 assert fieldNames.size() > 0 : "Can't make an or node with no fields";
251 Node result;
252
253 if (fieldNames.size() == 1) {
254 result = new Node("field", fieldNames.get(0), offset);
255 } else {
256 ArrayList<Node> children = new ArrayList<Node>();
257
258 for (int i = 0; i < fieldNames.size(); ++i) {
259 children.add(new Node("field", fieldNames.get(i), offset));
260 }
261
262 result = new Node("extentor", children, offset);
263 }
264
265 return result;
266 }
267
268 public static Node parseTerm(String query, int offset) {
269
270 ArrayList<String> chunks = splitStringRespectingEscapes(query, '.');
271
272
273
274 Node result = new Node("text", decodeEscapes(chunks.get(0)), offset);
275 result = parseFields(result, chunks.subList(1, chunks.size()), offset);
276
277 return result;
278 }
279
280 public static int findOperatorExpressionEnd(String query, int offset) {
281
282
283 int end = findOperatorEnd(query, offset);
284 int textEnd = findTextEnd(query, end);
285
286 if (end != textEnd && query.charAt(end) == '.')
287 return textEnd;
288
289 return end;
290 }
291
292 /***
293 * Parses arguments to an operator. This method can also be used to parse
294 * the elements of a query, since terms in a query are implicitly in a #combine
295 * operator.
296 *
297 * @param query The query, or a substring of it, that contains argument text.
298 * @param offset The offset into the full query string (offset == 0 if query is the full query).
299 * @return a List of Nodes, which represent the query.
300 */
301 public static ArrayList<Node> parseArguments(String query, int offset) {
302 ArrayList<Node> arguments = new ArrayList<Node>();
303 int start = 0;
304 boolean inElement = false;
305
306
307 for (int i = 0; i < query.length(); i++) {
308 char current = query.charAt(i);
309
310 if (!Character.isWhitespace(current)) {
311 if (current == '#') {
312 Node child = parseOperatorExpression(query, i, offset);
313 arguments.add(child);
314 i = findOperatorExpressionEnd(query, i);
315 } else {
316 int end = findTextEnd(query, i);
317 Node child = parseTerm(query.substring(i, end), offset + i);
318 arguments.add(child);
319 i = end;
320 }
321 }
322 }
323
324
325 if (inElement) {
326 Node child = new Node("text", query.substring(start), offset + start);
327 arguments.add(child);
328 }
329
330 return arguments;
331 }
332
333 public static Node parse(String query) {
334 ArrayList<Node> arguments = parseArguments(query, 0);
335
336 if (query.length() == 0) {
337 return new Node("text", "");
338 } else if (arguments.size() == 1) {
339 return arguments.get(0);
340 } else {
341 return new Node("combine", arguments, 0);
342 }
343 }
344
345 public static Set<String> findQueryTerms(Node queryTree) {
346 HashSet<String> queryTerms = new HashSet<String>();
347
348 if (queryTree.getOperator().equals("text")) {
349 queryTerms.add(queryTree.getDefaultParameter());
350 } else {
351 for (Node child : queryTree.getInternalNodes()) {
352 queryTerms.addAll(findQueryTerms(child));
353 }
354 }
355
356 return queryTerms;
357 }
358
359 private static Node parseFields(Node result, List<String> chunks, int offset) {
360 for (int i = 0; i < chunks.size(); i++) {
361 ArrayList<Node> children = new ArrayList<Node>();
362 children.add(result);
363 String fieldText = chunks.get(i);
364 boolean isSmoothingField = false;
365 if (fieldText.startsWith("(") && fieldText.endsWith(")")) {
366 isSmoothingField = true;
367 fieldText = fieldText.substring(1, fieldText.length() - 1);
368 }
369 ArrayList<String> fieldNames = splitStringRespectingEscapes(fieldText, ',');
370 Node fieldNode = fieldOrNode(fieldNames, offset);
371 children.add(fieldNode);
372 if (isSmoothingField) {
373 result = new Node("smoothinside", children, offset);
374 } else {
375 result = new Node("inside", children, offset);
376 }
377 }
378 return result;
379 }
380
381 private static Node parseOperatorExpression(String query, int i, int offset) {
382
383
384 int end = findOperatorEnd(query, i);
385 Node child = parseOperator(query.substring(i, end), offset + i);
386
387 int textEnd = findTextEnd(query, end);
388 if (textEnd != end && query.charAt(end) == '.') {
389 String remainingText = query.substring(end + 1, textEnd);
390 ArrayList<String> chunks = splitStringRespectingEscapes(remainingText, '.');
391 child = parseFields(child, chunks, end);
392 }
393 return child;
394 }
395 }