View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.retrieval.query;
4   
5   import java.util.ArrayList;
6   import java.util.List;
7   
/**
 * <p>SimpleQuery parses the kind of queries you might expect for an end-user search engine.
 * The format is also meant to be similar to Lucene's query format.</p>
 * 
 * Queries can be single terms:<br/>
 *    <tt>white house</tt><br/>
 * or phrases:<br/>
 *    <tt>"white house"</tt><br/>
 * and have fields:<br/>
 *    <tt>title:"white house"</tt><br/>
 * or weights:<br/>
 *    <tt>white^4 house^2</tt><br/>
 * 
 * <p>A query can be parsed into a list of QueryTerms or translated into a tree of Nodes
 * which can be used with the StructuredRetrieval code.</p>
 * 
 * @author trevor
 */
26  public class SimpleQuery {
27      public static class QueryTerm {
28          public QueryTerm(String text) {
29              this.weight = 1.0;
30              this.field = null;
31              this.text = text;
32          }
33  
34          public QueryTerm(String text, String field, double weight) {
35              this.text = text;
36              this.field = field;
37              this.weight = weight;
38          }
39  
40          @Override
41          public boolean equals(Object o) {
42              if (!(o instanceof QueryTerm))
43                  return false;
44  
45              QueryTerm other = (QueryTerm) o;
46              return text.equals(other.text) &&
47                      ((field != null) ? field.equals(other.field) : other.field == null) &&
48                      weight == other.weight;
49          }
50  
51          @Override
52          public int hashCode() {
53              int hash = 5;
54              hash = 41 * hash + (this.text != null ? this.text.hashCode() : 0);
55              hash = 41 * hash + (this.field != null ? this.field.hashCode() : 0);
56              hash = 41 * hash + (int) (Double.doubleToLongBits(this.weight) ^ (Double.
57                      doubleToLongBits(this.weight) >>> 32));
58              return hash;
59          }
60  
61          @Override
62          public String toString() {
63              String term = text;
64  
65              // if this is a multi-word query, enclose it in quotes
66              if (term.contains(" ")) {
67                  term = "\"" + term + "\"";            // use the minimum amount of syntax necessary to
68              // express the query.  If everything is specified, 
69              // the format is field:term^weight.
70              }
71              if (field != null && weight != 1.0) {
72                  return String.format("%s:%s^%f", field, term, weight);
73              }
74              if (field != null) {
75                  return String.format("%s:%s", field, term);
76              }
77              if (weight != 1.0) {
78                  return String.format("%s^%f", term, weight);
79              }
80              return text;
81          }
82          public String text;
83          public String field;
84          public double weight;
85      }
86  
87      /*** 
88       * The format of the query term is <tt>field:term^weight</tt>.
89       * Both the field and the weight are optional, and the term may
90       * be enclosed in quotes.
91       *
92       * @return A QueryTerm object describing the query term.
93       */
94      public static QueryTerm parseQueryTerm(String term) {
95          double weight = 1.0;
96          String field = null;
97  
98          int colon = term.indexOf(':');
99          if (colon >= 0) {
100             field = term.substring(0, colon);
101             term = term.substring(colon + 1);
102         }
103 
104         int caret = term.indexOf('^');
105         if (caret >= 0) {
106             weight = Double.parseDouble(term.substring(caret + 1));
107             term = term.substring(0, caret);
108         }
109 
110         if (term.startsWith("\"")) {
111             term = term.substring(1);
112         }
113         if (term.endsWith("\"")) {
114             term = term.substring(0, term.length() - 1);
115         }
116         return new QueryTerm(term, field, weight);
117     }
118 
119     public static List<String> textQueryTerms(String query) {
120         boolean inQuote = false;
121         int firstNonSpace = query.length() + 1;
122         int i = 0;
123         ArrayList<String> results = new ArrayList<String>();
124 
125         // each loop parses a single term
126         while (i < query.length()) {
127             // parsing goes in two phases; first we're trying to bypass inital
128             // spaces before a query term.  after that point, we parse until the
129             // next space that's not in quotes.
130             for (; i < query.length(); i++) {
131                 char c = query.charAt(i);
132 
133                 if (Character.isSpaceChar(c)) {
134                     if (!inQuote) {
135                         if (firstNonSpace < i) {
136                             String term = query.substring(firstNonSpace, i);
137                             results.add(term);
138                         }
139                         firstNonSpace = query.length() + 1;
140                     }
141                 } else if (c == '"') {
142                     firstNonSpace = Math.min(firstNonSpace, i);
143                     inQuote = !inQuote;
144                 } else {
145                     firstNonSpace = Math.min(firstNonSpace, i);
146                 }
147             }
148         }
149 
150         if (firstNonSpace < query.length()) {
151             results.add(query.substring(firstNonSpace, query.length()));
152         }
153 
154         return results;
155     }
156 
157     public static List<QueryTerm> parse(String query) {
158         ArrayList<QueryTerm> results = new ArrayList<QueryTerm>();
159         int position = 0;
160         String term = null;
161 
162         List<String> textTerms = textQueryTerms(query);
163         ArrayList<QueryTerm> parsedTerms = new ArrayList<QueryTerm>();
164 
165         for (String textTerm : textTerms) {
166             parsedTerms.add(parseQueryTerm(textTerm));
167         }
168 
169         return parsedTerms;
170     }
171 
172     public static Node parseTree(String query) {
173         List<QueryTerm> terms = parse(query);
174         ArrayList<Node> nodes = new ArrayList<Node>();
175 
176         for (QueryTerm term : terms) {
177             Node termNode = new Node("text", term.text);
178             // if this is a phrase, put the terms in a ordered window
179             if (term.text.contains(" ")) {
180                 String[] phraseTerms = term.text.split(" ");
181                 ArrayList<Node> children = new ArrayList<Node>();
182                 for (String phraseTerm : phraseTerms) {
183                     children.add(new Node("text", phraseTerm));
184                 }
185                 termNode = new Node("ordered", "1", children);
186             }
187             // if this is in a field, add the field restriction
188             if (term.field != null) {
189                 ArrayList<Node> children = new ArrayList<Node>();
190                 children.add(termNode);
191                 children.add(new Node("field", term.field));
192                 termNode = new Node("inside", children);
193             }
194             // if this is weighted, scale it
195             if (term.weight != 1.0) {
196                 ArrayList<Node> children = new ArrayList<Node>();
197                 children.add(termNode);
198                 termNode = new Node("scale", Double.toString(term.weight), children);
199             }
200             nodes.add(termNode);
201         }
202 
203         if (nodes.size() < 1)
204             return null;
205         
206         if (nodes.size() == 1)
207             return nodes.get(0);
208 
209         return new Node("combine", nodes);
210     }
211 }