View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.store;
4   
5   import java.io.IOException;
6   import java.util.ArrayList;
7   import java.util.Collections;
8   import java.util.HashSet;
9   import java.util.Set;
10  import org.galagosearch.core.parse.Document;
11  import org.galagosearch.core.parse.TagTokenizer;
12  
13  /***
14   * This is a very simple snippet generator for generating small summaries of returned
15   * documents.
16   * 
17   * @author trevor
18   */
19  public class SnippetGenerator {
20      public static final int width = 5;
21  
22      public static class Match {
23          public Match(String term, int index) {
24              this(term, index, index + 1);
25          }
26  
27          public Match(String term, int start, int end) {
28              this.term = term;
29              this.start = start;
30              this.end = end;
31          }
32          String term;
33          int start;
34          int end;
35      }
36  
37      public static class SnippetRegion {
38          int start;
39          int end;
40          ArrayList<Match> matches;
41  
42          public SnippetRegion(String term, int index, int width, int maximum) {
43              matches = new ArrayList();
44              matches.add(new Match(term, index));
45              start = Math.max(index - width, 0);
46              end = Math.min(maximum, index + width);
47          }
48  
49          public SnippetRegion(ArrayList<Match> m, int s, int e) {
50              matches = m;
51              start = s;
52              end = e;
53          }
54  
55          public boolean overlap(SnippetRegion o) {
56              return (start <= o.start && end >= o.start) ||
57                      (start <= o.end && end >= o.end);
58          }
59  
60          public boolean within(SnippetRegion o, int distance) {
61              if (overlap(o)) {
62                  return true;
63              }
64              if (Math.abs(start - o.end) <= distance) {
65                  return true;
66              }
67              if (Math.abs(end - o.start) <= distance) {
68                  return true;
69              }
70              return false;
71          }
72  
73          public SnippetRegion merge(SnippetRegion o) {
74              ArrayList<Match> m = new ArrayList();
75              m.addAll(matches);
76              m.addAll(o.matches);
77  
78              SnippetRegion result =
79                      new SnippetRegion(m, Math.min(start, o.start), Math.max(end, o.end));
80              return result;
81          }
82  
83          public boolean equals(SnippetRegion o) {
84              return start == o.start && end == o.end;
85          }
86  
87          public int size() {
88              return this.end - this.start;
89          }
90  
91          public ArrayList<Match> getMatches() {
92              return matches;
93          }
94      }
95  
96      public class Snippet {
97          private ArrayList<SnippetRegion> regions;
98          double score;
99  
100         public Snippet(ArrayList<SnippetRegion> regions) {
101             this.regions = regions;
102         }
103 
104         @Override
105         public int hashCode() {
106             int result = 0;
107 
108             for (SnippetRegion region : regions) {
109                 result += region.end * 3 + region.start;
110                 result *= 5;
111                 result += region.getMatches().size();
112             }
113 
114             return result;
115         }
116 
117         @Override
118         public boolean equals(Object o) {
119             if (!(o instanceof Snippet)) return false;
120             Snippet other = (Snippet) o;
121 
122             if (other.regions.size() != regions.size()) {
123                 return false;
124             }
125             for (int i = 0; i < regions.size(); i++) {
126                 if (regions.get(i).equals(other.regions.get(i))) {
127                     continue;
128                 }
129                 return false;
130             }
131 
132             return true;
133         }
134 
135         public double score() {
136             if (score == 0) {
137                 cacheScore();
138             }
139             return score;
140         }
141 
142         public void cacheScore() {
143             // Factors:  big snippets are discounted
144             //           coverage is good
145             //           proximity is good
146             //           close to document start is good
147 
148             int wordLength = 0;
149             int prox = 0;
150 
151             HashSet<String> words = new HashSet<String>();
152 
153             for (SnippetRegion region : regions) {
154                 wordLength += region.size();
155                 prox += Math.pow(2, region.getMatches().size());
156 
157                 for (SnippetGenerator.Match m : region.getMatches()) {
158                     words.add(m.term);
159                 }
160             }
161 
162             score = -Math.pow(1.2, Math.min(0, wordLength - 150)) + prox + Math.pow(words.size(), 2);
163         }
164 
165         /***
166          * <p>This is part of an aborted attempt to score many candidate snippets to produce
167          * the best one.  It returns many different candidates which can then be scored
168          * using the score method.  As coded, this method is too slow to be useful.</o>
169          * 
170          * @return
171          */
172         public ArrayList<Snippet> expand() {
173             ArrayList<Snippet> results = new ArrayList();
174             int size = 0;
175 
176             for (SnippetRegion region : regions) {
177                 size += region.size();
178             }
179 
180             if (size > 150) {
181                 // try deletions
182                 for (int i = 0; i < regions.size(); i++) {
183                     ArrayList<SnippetRegion> newRegions = new ArrayList();
184 
185                     newRegions.addAll(regions.subList(0, i));
186                     newRegions.addAll(regions.subList(i + 1, regions.size()));
187 
188                     results.add(new Snippet(newRegions));
189                 }
190             }
191 
192             // try merges
193             for (int i = 0; i < regions.size() - 1; i++) {
194                 if (regions.get(i + 1).start - regions.get(i).end > 100) {
195                     continue;
196                 }
197                 ArrayList<SnippetRegion> newRegions = new ArrayList();
198 
199                 newRegions.addAll(regions.subList(0, i));
200                 SnippetRegion merged = regions.get(i).merge(regions.get(i + 1));
201                 newRegions.add(merged);
202                 newRegions.addAll(regions.subList(i + 2, regions.size()));
203 
204                 results.add(new Snippet(newRegions));
205             }
206 
207             return results;
208         }
209     }
210 
211     private Document parseAsDocument(String text, ArrayList<TagTokenizer.Pair> positions) throws IOException {
212         Document document = new Document();
213         document.text = text;
214 
215         // Tokenize the document
216         TagTokenizer tokenizer = new TagTokenizer();
217         tokenizer.process(document);
218 
219         if (positions != null) {
220             positions.addAll(tokenizer.getTokenPositions());
221         }
222         return document;
223     }
224 
225     /***
226      * <p>Highlights query terms in a string of document text.  This is most useful
227      * for highlighting query terms in document titles.</p>
228      */
229 
230     public String highlight(String documentText, Set<String> queryTerms) throws IOException {
231         ArrayList<TagTokenizer.Pair> positions = new ArrayList();
232         Document document = parseAsDocument(documentText, positions);
233 
234         SnippetRegion merged = findSingleRegion(document, queryTerms);
235         Snippet best = new Snippet(new ArrayList(Collections.singletonList(merged)));
236 
237         String result = buildHtmlString(best, document, positions);
238         return result;
239     }
240 
241     /***
242      * <p>Produces a short query-dependent summary of a document with query terms
243      * highlighted.  The result is an HTML string.</p>
244      */
245 
246     public String getSnippet(String documentText, Set<String> queryTerms) throws IOException {
247         ArrayList<TagTokenizer.Pair> positions = new ArrayList();
248         Document document = parseAsDocument(documentText, positions);
249 
250         return generateSnippet(document, positions, queryTerms);
251     }
252 
253     private String generateSnippet(
254             final Document document,
255             final ArrayList<TagTokenizer.Pair> positions,
256             final Set<String> queryTerms) {
257         ArrayList<SnippetRegion> regions = findMatches(document, queryTerms);
258         ArrayList<SnippetRegion> finalRegions = combineRegions(regions);
259         Snippet best = new Snippet(finalRegions);
260 
261         String result = buildHtmlString(best, document, positions);
262         return result;
263     }
264 
265     private SnippetRegion findSingleRegion(final Document document, final Set<String> queryTerms) {
266         // Make a snippet region object for each term occurrence in the document,
267         // while also counting matches
268         ArrayList<Match> matches = new ArrayList();
269 
270         for (int i = 0; i < document.terms.size(); i++) {
271             String term = document.terms.get(i);
272             if (queryTerms.contains(term)) {
273                 matches.add(new Match(term, i));
274             }
275         }
276 
277         return new SnippetRegion(matches, 0, document.terms.size());
278     }
279 
280     private ArrayList<SnippetRegion> findMatches(final Document document, final Set<String> queryTerms) {
281         // Make a snippet region object for each term occurrence in the document,
282         // while also counting matches
283         ArrayList<SnippetRegion> regions = new ArrayList();
284 
285         for (int i = 0; i < document.terms.size(); i++) {
286             String term = document.terms.get(i);
287             if (queryTerms.contains(term)) {
288                 regions.add(new SnippetRegion(term, i, width, document.terms.size()));
289             }
290         }
291         return regions;
292     }
293 
294     public String buildHtmlString(Snippet best, Document document, ArrayList<TagTokenizer.Pair> positions) {
295         StringBuilder builder = new StringBuilder();
296 
297         for (SnippetRegion region : best.regions) {
298             if (region.start != 0) {
299                 builder.append("...");
300             }
301             int startChar = positions.get(region.start).start;
302             int endChar = positions.get(region.end - 1).end;
303             int start = 0;
304 
305             // section string
306             String section = document.text.substring(startChar, endChar);
307 
308             for (Match m : region.matches) {
309                 int startMatchChar = positions.get(m.start).start - startChar;
310                 int endMatchChar = positions.get(m.end - 1).end - startChar;
311 
312                 String intermediate = stripTags(section.substring(start, startMatchChar));
313                 builder.append(intermediate);
314                 builder.append("<strong>");
315                 builder.append(stripTags(section.substring(startMatchChar, endMatchChar)));
316                 builder.append("</strong>");
317                 start = endMatchChar;
318             }
319 
320             if (start >= 0) {
321                 builder.append(stripTags(section.substring(start)));
322             }
323         }
324 
325         if (best.regions.size() > 1 && best.regions.get(best.regions.size() - 1).end != document.terms.
326                 size()) {
327             builder.append("...");
328         }
329         return builder.toString();
330     }
331 
332     public String stripTag(String tag, String input) {
333         input = input.replaceAll("<" + tag.toLowerCase() + "[^>]*>.*?</" + tag.toLowerCase() + ">",
334                                  "");
335         input = input.replaceAll("<" + tag.toUpperCase() + "[^>]*>.*?</" + tag.toUpperCase() + ">",
336                                  "");
337         return input;
338     }
339 
340     public String stripTags(String input) {
341         input = stripTag("script", input);
342         input = stripTag("style", input);
343         input = input.replaceAll("<!--.*?-->", "");
344 
345         input = input.replaceAll("&nbsp;", " ");
346         input = input.replaceAll("<[^>]*>", " ");
347         input = input.replaceAll("//s+", " ");
348 
349         return input;
350     }
351     // Goals:  1. find as many terms as possible
352     //         2. find terms that are close together
353     //         3. break on sentences when possible (?)
354     // BUGBUG: might not have all the terms highlighted here
355     public ArrayList<SnippetRegion> combineRegions(final ArrayList<SnippetRegion> regions) {
356         ArrayList<SnippetRegion> finalRegions = new ArrayList();
357         SnippetRegion last = null;
358         int snippetSize = 0;
359         int maxSize = 40;
360 
361         for (int i = 0; i < regions.size(); i++) {
362             SnippetRegion current = regions.get(i);
363 
364             if (last == null) {
365                 last = current;
366             } else if (last.overlap(current)) {
367                 SnippetRegion bigger = last.merge(current);
368 
369                 if (bigger.size() + snippetSize > maxSize) {
370                     finalRegions.add(last);
371                     last = null;
372                 } else {
373                     last = bigger;
374                 }
375             } else if (last.size() + snippetSize > maxSize) {
376                 break;
377             } else {
378                 finalRegions.add(last);
379                 snippetSize += last.size();
380                 last = current;
381             }
382         }
383 
384         if (last != null && snippetSize + last.size() < maxSize) {
385             finalRegions.add(last);
386         }
387 
388         return finalRegions;
389     }
390 }