1
2
3 package org.galagosearch.core.store;
4
5 import java.io.IOException;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.HashSet;
9 import java.util.Set;
10 import org.galagosearch.core.parse.Document;
11 import org.galagosearch.core.parse.TagTokenizer;
12
13 /***
14 * This is a very simple snippet generator for generating small summaries of returned
15 * documents.
16 *
17 * @author trevor
18 */
19 public class SnippetGenerator {
20 public static final int width = 5;
21
22 public static class Match {
23 public Match(String term, int index) {
24 this(term, index, index + 1);
25 }
26
27 public Match(String term, int start, int end) {
28 this.term = term;
29 this.start = start;
30 this.end = end;
31 }
32 String term;
33 int start;
34 int end;
35 }
36
37 public static class SnippetRegion {
38 int start;
39 int end;
40 ArrayList<Match> matches;
41
42 public SnippetRegion(String term, int index, int width, int maximum) {
43 matches = new ArrayList();
44 matches.add(new Match(term, index));
45 start = Math.max(index - width, 0);
46 end = Math.min(maximum, index + width);
47 }
48
49 public SnippetRegion(ArrayList<Match> m, int s, int e) {
50 matches = m;
51 start = s;
52 end = e;
53 }
54
55 public boolean overlap(SnippetRegion o) {
56 return (start <= o.start && end >= o.start) ||
57 (start <= o.end && end >= o.end);
58 }
59
60 public boolean within(SnippetRegion o, int distance) {
61 if (overlap(o)) {
62 return true;
63 }
64 if (Math.abs(start - o.end) <= distance) {
65 return true;
66 }
67 if (Math.abs(end - o.start) <= distance) {
68 return true;
69 }
70 return false;
71 }
72
73 public SnippetRegion merge(SnippetRegion o) {
74 ArrayList<Match> m = new ArrayList();
75 m.addAll(matches);
76 m.addAll(o.matches);
77
78 SnippetRegion result =
79 new SnippetRegion(m, Math.min(start, o.start), Math.max(end, o.end));
80 return result;
81 }
82
83 public boolean equals(SnippetRegion o) {
84 return start == o.start && end == o.end;
85 }
86
87 public int size() {
88 return this.end - this.start;
89 }
90
91 public ArrayList<Match> getMatches() {
92 return matches;
93 }
94 }
95
96 public class Snippet {
97 private ArrayList<SnippetRegion> regions;
98 double score;
99
100 public Snippet(ArrayList<SnippetRegion> regions) {
101 this.regions = regions;
102 }
103
104 @Override
105 public int hashCode() {
106 int result = 0;
107
108 for (SnippetRegion region : regions) {
109 result += region.end * 3 + region.start;
110 result *= 5;
111 result += region.getMatches().size();
112 }
113
114 return result;
115 }
116
117 @Override
118 public boolean equals(Object o) {
119 if (!(o instanceof Snippet)) return false;
120 Snippet other = (Snippet) o;
121
122 if (other.regions.size() != regions.size()) {
123 return false;
124 }
125 for (int i = 0; i < regions.size(); i++) {
126 if (regions.get(i).equals(other.regions.get(i))) {
127 continue;
128 }
129 return false;
130 }
131
132 return true;
133 }
134
135 public double score() {
136 if (score == 0) {
137 cacheScore();
138 }
139 return score;
140 }
141
142 public void cacheScore() {
143
144
145
146
147
148 int wordLength = 0;
149 int prox = 0;
150
151 HashSet<String> words = new HashSet<String>();
152
153 for (SnippetRegion region : regions) {
154 wordLength += region.size();
155 prox += Math.pow(2, region.getMatches().size());
156
157 for (SnippetGenerator.Match m : region.getMatches()) {
158 words.add(m.term);
159 }
160 }
161
162 score = -Math.pow(1.2, Math.min(0, wordLength - 150)) + prox + Math.pow(words.size(), 2);
163 }
164
165 /***
166 * <p>This is part of an aborted attempt to score many candidate snippets to produce
167 * the best one. It returns many different candidates which can then be scored
168 * using the score method. As coded, this method is too slow to be useful.</o>
169 *
170 * @return
171 */
172 public ArrayList<Snippet> expand() {
173 ArrayList<Snippet> results = new ArrayList();
174 int size = 0;
175
176 for (SnippetRegion region : regions) {
177 size += region.size();
178 }
179
180 if (size > 150) {
181
182 for (int i = 0; i < regions.size(); i++) {
183 ArrayList<SnippetRegion> newRegions = new ArrayList();
184
185 newRegions.addAll(regions.subList(0, i));
186 newRegions.addAll(regions.subList(i + 1, regions.size()));
187
188 results.add(new Snippet(newRegions));
189 }
190 }
191
192
193 for (int i = 0; i < regions.size() - 1; i++) {
194 if (regions.get(i + 1).start - regions.get(i).end > 100) {
195 continue;
196 }
197 ArrayList<SnippetRegion> newRegions = new ArrayList();
198
199 newRegions.addAll(regions.subList(0, i));
200 SnippetRegion merged = regions.get(i).merge(regions.get(i + 1));
201 newRegions.add(merged);
202 newRegions.addAll(regions.subList(i + 2, regions.size()));
203
204 results.add(new Snippet(newRegions));
205 }
206
207 return results;
208 }
209 }
210
211 private Document parseAsDocument(String text, ArrayList<TagTokenizer.Pair> positions) throws IOException {
212 Document document = new Document();
213 document.text = text;
214
215
216 TagTokenizer tokenizer = new TagTokenizer();
217 tokenizer.process(document);
218
219 if (positions != null) {
220 positions.addAll(tokenizer.getTokenPositions());
221 }
222 return document;
223 }
224
225 /***
226 * <p>Highlights query terms in a string of document text. This is most useful
227 * for highlighting query terms in document titles.</p>
228 */
229
230 public String highlight(String documentText, Set<String> queryTerms) throws IOException {
231 ArrayList<TagTokenizer.Pair> positions = new ArrayList();
232 Document document = parseAsDocument(documentText, positions);
233
234 SnippetRegion merged = findSingleRegion(document, queryTerms);
235 Snippet best = new Snippet(new ArrayList(Collections.singletonList(merged)));
236
237 String result = buildHtmlString(best, document, positions);
238 return result;
239 }
240
241 /***
242 * <p>Produces a short query-dependent summary of a document with query terms
243 * highlighted. The result is an HTML string.</p>
244 */
245
246 public String getSnippet(String documentText, Set<String> queryTerms) throws IOException {
247 ArrayList<TagTokenizer.Pair> positions = new ArrayList();
248 Document document = parseAsDocument(documentText, positions);
249
250 return generateSnippet(document, positions, queryTerms);
251 }
252
253 private String generateSnippet(
254 final Document document,
255 final ArrayList<TagTokenizer.Pair> positions,
256 final Set<String> queryTerms) {
257 ArrayList<SnippetRegion> regions = findMatches(document, queryTerms);
258 ArrayList<SnippetRegion> finalRegions = combineRegions(regions);
259 Snippet best = new Snippet(finalRegions);
260
261 String result = buildHtmlString(best, document, positions);
262 return result;
263 }
264
265 private SnippetRegion findSingleRegion(final Document document, final Set<String> queryTerms) {
266
267
268 ArrayList<Match> matches = new ArrayList();
269
270 for (int i = 0; i < document.terms.size(); i++) {
271 String term = document.terms.get(i);
272 if (queryTerms.contains(term)) {
273 matches.add(new Match(term, i));
274 }
275 }
276
277 return new SnippetRegion(matches, 0, document.terms.size());
278 }
279
280 private ArrayList<SnippetRegion> findMatches(final Document document, final Set<String> queryTerms) {
281
282
283 ArrayList<SnippetRegion> regions = new ArrayList();
284
285 for (int i = 0; i < document.terms.size(); i++) {
286 String term = document.terms.get(i);
287 if (queryTerms.contains(term)) {
288 regions.add(new SnippetRegion(term, i, width, document.terms.size()));
289 }
290 }
291 return regions;
292 }
293
294 public String buildHtmlString(Snippet best, Document document, ArrayList<TagTokenizer.Pair> positions) {
295 StringBuilder builder = new StringBuilder();
296
297 for (SnippetRegion region : best.regions) {
298 if (region.start != 0) {
299 builder.append("...");
300 }
301 int startChar = positions.get(region.start).start;
302 int endChar = positions.get(region.end - 1).end;
303 int start = 0;
304
305
306 String section = document.text.substring(startChar, endChar);
307
308 for (Match m : region.matches) {
309 int startMatchChar = positions.get(m.start).start - startChar;
310 int endMatchChar = positions.get(m.end - 1).end - startChar;
311
312 String intermediate = stripTags(section.substring(start, startMatchChar));
313 builder.append(intermediate);
314 builder.append("<strong>");
315 builder.append(stripTags(section.substring(startMatchChar, endMatchChar)));
316 builder.append("</strong>");
317 start = endMatchChar;
318 }
319
320 if (start >= 0) {
321 builder.append(stripTags(section.substring(start)));
322 }
323 }
324
325 if (best.regions.size() > 1 && best.regions.get(best.regions.size() - 1).end != document.terms.
326 size()) {
327 builder.append("...");
328 }
329 return builder.toString();
330 }
331
332 public String stripTag(String tag, String input) {
333 input = input.replaceAll("<" + tag.toLowerCase() + "[^>]*>.*?</" + tag.toLowerCase() + ">",
334 "");
335 input = input.replaceAll("<" + tag.toUpperCase() + "[^>]*>.*?</" + tag.toUpperCase() + ">",
336 "");
337 return input;
338 }
339
340 public String stripTags(String input) {
341 input = stripTag("script", input);
342 input = stripTag("style", input);
343 input = input.replaceAll("<!--.*?-->", "");
344
345 input = input.replaceAll(" ", " ");
346 input = input.replaceAll("<[^>]*>", " ");
347 input = input.replaceAll("//s+", " ");
348
349 return input;
350 }
351
352
353
354
355 public ArrayList<SnippetRegion> combineRegions(final ArrayList<SnippetRegion> regions) {
356 ArrayList<SnippetRegion> finalRegions = new ArrayList();
357 SnippetRegion last = null;
358 int snippetSize = 0;
359 int maxSize = 40;
360
361 for (int i = 0; i < regions.size(); i++) {
362 SnippetRegion current = regions.get(i);
363
364 if (last == null) {
365 last = current;
366 } else if (last.overlap(current)) {
367 SnippetRegion bigger = last.merge(current);
368
369 if (bigger.size() + snippetSize > maxSize) {
370 finalRegions.add(last);
371 last = null;
372 } else {
373 last = bigger;
374 }
375 } else if (last.size() + snippetSize > maxSize) {
376 break;
377 } else {
378 finalRegions.add(last);
379 snippetSize += last.size();
380 last = current;
381 }
382 }
383
384 if (last != null && snippetSize + last.size() < maxSize) {
385 finalRegions.add(last);
386 }
387
388 return finalRegions;
389 }
390 }