Coverage Report - org.galagosearch.core.store.SnippetGenerator
 
Classes in this File Line Coverage Branch Coverage Complexity
SnippetGenerator
78%
71/91
58%
21/36
0
SnippetGenerator$Match
100%
7/7
N/A
0
SnippetGenerator$Snippet
8%
4/52
0%
0/26
0
SnippetGenerator$SnippetRegion
67%
18/27
11%
2/18
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.store;
 4  
 
 5  
 import java.io.IOException;
 6  
 import java.util.ArrayList;
 7  
 import java.util.Collections;
 8  
 import java.util.HashSet;
 9  
 import java.util.Set;
 10  
 import org.galagosearch.core.parse.Document;
 11  
 import org.galagosearch.core.parse.TagTokenizer;
 12  
 
 13  
 /**
 14  
  * This is a very simple snippet generator for generating small summaries of returned
 15  
  * documents.
 16  
  * 
 17  
  * @author trevor
 18  
  */
 19  8
 public class SnippetGenerator {
 20  
     public static final int width = 5;
 21  
 
 22  
     public static class Match {
 23  
         public Match(String term, int index) {
 24  8
             this(term, index, index + 1);
 25  8
         }
 26  
 
 27  8
         public Match(String term, int start, int end) {
 28  8
             this.term = term;
 29  8
             this.start = start;
 30  8
             this.end = end;
 31  8
         }
 32  
         String term;
 33  
         int start;
 34  
         int end;
 35  
     }
 36  
 
 37  
     public static class SnippetRegion {
 38  
         int start;
 39  
         int end;
 40  
         ArrayList<Match> matches;
 41  
 
 42  8
         public SnippetRegion(String term, int index, int width, int maximum) {
 43  8
             matches = new ArrayList();
 44  8
             matches.add(new Match(term, index));
 45  8
             start = Math.max(index - width, 0);
 46  8
             end = Math.min(maximum, index + width);
 47  8
         }
 48  
 
 49  4
         public SnippetRegion(ArrayList<Match> m, int s, int e) {
 50  4
             matches = m;
 51  4
             start = s;
 52  4
             end = e;
 53  4
         }
 54  
 
 55  
         public boolean overlap(SnippetRegion o) {
 56  4
             return (start <= o.start && end >= o.start) ||
 57  
                     (start <= o.end && end >= o.end);
 58  
         }
 59  
 
 60  
         public boolean within(SnippetRegion o, int distance) {
 61  0
             if (overlap(o)) {
 62  0
                 return true;
 63  
             }
 64  0
             if (Math.abs(start - o.end) <= distance) {
 65  0
                 return true;
 66  
             }
 67  0
             if (Math.abs(end - o.start) <= distance) {
 68  0
                 return true;
 69  
             }
 70  0
             return false;
 71  
         }
 72  
 
 73  
         public SnippetRegion merge(SnippetRegion o) {
 74  4
             ArrayList<Match> m = new ArrayList();
 75  4
             m.addAll(matches);
 76  4
             m.addAll(o.matches);
 77  
 
 78  4
             SnippetRegion result =
 79  
                     new SnippetRegion(m, Math.min(start, o.start), Math.max(end, o.end));
 80  4
             return result;
 81  
         }
 82  
 
 83  
         public boolean equals(SnippetRegion o) {
 84  0
             return start == o.start && end == o.end;
 85  
         }
 86  
 
 87  
         public int size() {
 88  8
             return this.end - this.start;
 89  
         }
 90  
 
 91  
         public ArrayList<Match> getMatches() {
 92  0
             return matches;
 93  
         }
 94  
     }
 95  
 
 96  8
     public class Snippet {
 97  
         private ArrayList<SnippetRegion> regions;
 98  
         double score;
 99  
 
 100  8
         public Snippet(ArrayList<SnippetRegion> regions) {
 101  8
             this.regions = regions;
 102  8
         }
 103  
 
 104  
         @Override
 105  
         public int hashCode() {
 106  0
             int result = 0;
 107  
 
 108  0
             for (SnippetRegion region : regions) {
 109  0
                 result += region.end * 3 + region.start;
 110  0
                 result *= 5;
 111  0
                 result += region.getMatches().size();
 112  
             }
 113  
 
 114  0
             return result;
 115  
         }
 116  
 
 117  
         @Override
 118  
         public boolean equals(Object o) {
 119  0
             if (!(o instanceof Snippet)) return false;
 120  0
             Snippet other = (Snippet) o;
 121  
 
 122  0
             if (other.regions.size() != regions.size()) {
 123  0
                 return false;
 124  
             }
 125  0
             for (int i = 0; i < regions.size(); i++) {
 126  0
                 if (regions.get(i).equals(other.regions.get(i))) {
 127  0
                     continue;
 128  
                 }
 129  0
                 return false;
 130  
             }
 131  
 
 132  0
             return true;
 133  
         }
 134  
 
 135  
         public double score() {
 136  0
             if (score == 0) {
 137  0
                 cacheScore();
 138  
             }
 139  0
             return score;
 140  
         }
 141  
 
 142  
         public void cacheScore() {
 143  
             // Factors:  big snippets are discounted
 144  
             //           coverage is good
 145  
             //           proximity is good
 146  
             //           close to document start is good
 147  
 
 148  0
             int wordLength = 0;
 149  0
             int prox = 0;
 150  
 
 151  0
             HashSet<String> words = new HashSet<String>();
 152  
 
 153  0
             for (SnippetRegion region : regions) {
 154  0
                 wordLength += region.size();
 155  0
                 prox += Math.pow(2, region.getMatches().size());
 156  
 
 157  0
                 for (SnippetGenerator.Match m : region.getMatches()) {
 158  0
                     words.add(m.term);
 159  
                 }
 160  
             }
 161  
 
 162  0
             score = -Math.pow(1.2, Math.min(0, wordLength - 150)) + prox + Math.pow(words.size(), 2);
 163  0
         }
 164  
 
 165  
         /**
 166  
          * <p>This is part of an aborted attempt to score many candidate snippets to produce
 167  
          * the best one.  It returns many different candidates which can then be scored
 168  
          * using the score method.  As coded, this method is too slow to be useful.</o>
 169  
          * 
 170  
          * @return
 171  
          */
 172  
         public ArrayList<Snippet> expand() {
 173  0
             ArrayList<Snippet> results = new ArrayList();
 174  0
             int size = 0;
 175  
 
 176  0
             for (SnippetRegion region : regions) {
 177  0
                 size += region.size();
 178  
             }
 179  
 
 180  0
             if (size > 150) {
 181  
                 // try deletions
 182  0
                 for (int i = 0; i < regions.size(); i++) {
 183  0
                     ArrayList<SnippetRegion> newRegions = new ArrayList();
 184  
 
 185  0
                     newRegions.addAll(regions.subList(0, i));
 186  0
                     newRegions.addAll(regions.subList(i + 1, regions.size()));
 187  
 
 188  0
                     results.add(new Snippet(newRegions));
 189  
                 }
 190  
             }
 191  
 
 192  
             // try merges
 193  0
             for (int i = 0; i < regions.size() - 1; i++) {
 194  0
                 if (regions.get(i + 1).start - regions.get(i).end > 100) {
 195  0
                     continue;
 196  
                 }
 197  0
                 ArrayList<SnippetRegion> newRegions = new ArrayList();
 198  
 
 199  0
                 newRegions.addAll(regions.subList(0, i));
 200  0
                 SnippetRegion merged = regions.get(i).merge(regions.get(i + 1));
 201  0
                 newRegions.add(merged);
 202  0
                 newRegions.addAll(regions.subList(i + 2, regions.size()));
 203  
 
 204  0
                 results.add(new Snippet(newRegions));
 205  
             }
 206  
 
 207  0
             return results;
 208  
         }
 209  
     }
 210  
 
 211  
     private Document parseAsDocument(String text, ArrayList<TagTokenizer.Pair> positions) throws IOException {
 212  8
         Document document = new Document();
 213  8
         document.text = text;
 214  
 
 215  
         // Tokenize the document
 216  8
         TagTokenizer tokenizer = new TagTokenizer();
 217  8
         tokenizer.process(document);
 218  
 
 219  8
         if (positions != null) {
 220  8
             positions.addAll(tokenizer.getTokenPositions());
 221  
         }
 222  8
         return document;
 223  
     }
 224  
 
 225  
     /**
 226  
      * <p>Highlights query terms in a string of document text.  This is most useful
 227  
      * for highlighting query terms in document titles.</p>
 228  
      */
 229  
 
 230  
     public String highlight(String documentText, Set<String> queryTerms) throws IOException {
 231  0
         ArrayList<TagTokenizer.Pair> positions = new ArrayList();
 232  0
         Document document = parseAsDocument(documentText, positions);
 233  
 
 234  0
         SnippetRegion merged = findSingleRegion(document, queryTerms);
 235  0
         Snippet best = new Snippet(new ArrayList(Collections.singletonList(merged)));
 236  
 
 237  0
         String result = buildHtmlString(best, document, positions);
 238  0
         return result;
 239  
     }
 240  
 
 241  
     /**
 242  
      * <p>Produces a short query-dependent summary of a document with query terms
 243  
      * highlighted.  The result is an HTML string.</p>
 244  
      */
 245  
 
 246  
     public String getSnippet(String documentText, Set<String> queryTerms) throws IOException {
 247  8
         ArrayList<TagTokenizer.Pair> positions = new ArrayList();
 248  8
         Document document = parseAsDocument(documentText, positions);
 249  
 
 250  8
         return generateSnippet(document, positions, queryTerms);
 251  
     }
 252  
 
 253  
     private String generateSnippet(
 254  
             final Document document,
 255  
             final ArrayList<TagTokenizer.Pair> positions,
 256  
             final Set<String> queryTerms) {
 257  8
         ArrayList<SnippetRegion> regions = findMatches(document, queryTerms);
 258  8
         ArrayList<SnippetRegion> finalRegions = combineRegions(regions);
 259  8
         Snippet best = new Snippet(finalRegions);
 260  
 
 261  8
         String result = buildHtmlString(best, document, positions);
 262  8
         return result;
 263  
     }
 264  
 
 265  
     private SnippetRegion findSingleRegion(final Document document, final Set<String> queryTerms) {
 266  
         // Make a snippet region object for each term occurrence in the document,
 267  
         // while also counting matches
 268  0
         ArrayList<Match> matches = new ArrayList();
 269  
 
 270  0
         for (int i = 0; i < document.terms.size(); i++) {
 271  0
             String term = document.terms.get(i);
 272  0
             if (queryTerms.contains(term)) {
 273  0
                 matches.add(new Match(term, i));
 274  
             }
 275  
         }
 276  
 
 277  0
         return new SnippetRegion(matches, 0, document.terms.size());
 278  
     }
 279  
 
 280  
     private ArrayList<SnippetRegion> findMatches(final Document document, final Set<String> queryTerms) {
 281  
         // Make a snippet region object for each term occurrence in the document,
 282  
         // while also counting matches
 283  8
         ArrayList<SnippetRegion> regions = new ArrayList();
 284  
 
 285  28
         for (int i = 0; i < document.terms.size(); i++) {
 286  20
             String term = document.terms.get(i);
 287  20
             if (queryTerms.contains(term)) {
 288  8
                 regions.add(new SnippetRegion(term, i, width, document.terms.size()));
 289  
             }
 290  
         }
 291  8
         return regions;
 292  
     }
 293  
 
 294  
     public String buildHtmlString(Snippet best, Document document, ArrayList<TagTokenizer.Pair> positions) {
 295  8
         StringBuilder builder = new StringBuilder();
 296  
 
 297  8
         for (SnippetRegion region : best.regions) {
 298  4
             if (region.start != 0) {
 299  0
                 builder.append("...");
 300  
             }
 301  4
             int startChar = positions.get(region.start).start;
 302  4
             int endChar = positions.get(region.end - 1).end;
 303  4
             int start = 0;
 304  
 
 305  
             // section string
 306  4
             String section = document.text.substring(startChar, endChar);
 307  
 
 308  4
             for (Match m : region.matches) {
 309  8
                 int startMatchChar = positions.get(m.start).start - startChar;
 310  8
                 int endMatchChar = positions.get(m.end - 1).end - startChar;
 311  
 
 312  8
                 String intermediate = stripTags(section.substring(start, startMatchChar));
 313  8
                 builder.append(intermediate);
 314  8
                 builder.append("<strong>");
 315  8
                 builder.append(stripTags(section.substring(startMatchChar, endMatchChar)));
 316  8
                 builder.append("</strong>");
 317  8
                 start = endMatchChar;
 318  8
             }
 319  
 
 320  4
             if (start >= 0) {
 321  4
                 builder.append(stripTags(section.substring(start)));
 322  
             }
 323  4
         }
 324  
 
 325  8
         if (best.regions.size() > 1 && best.regions.get(best.regions.size() - 1).end != document.terms.
 326  
                 size()) {
 327  0
             builder.append("...");
 328  
         }
 329  8
         return builder.toString();
 330  
     }
 331  
 
 332  
     public String stripTag(String tag, String input) {
 333  40
         input = input.replaceAll("<" + tag.toLowerCase() + "[^>]*>.*?</" + tag.toLowerCase() + ">",
 334  
                                  "");
 335  40
         input = input.replaceAll("<" + tag.toUpperCase() + "[^>]*>.*?</" + tag.toUpperCase() + ">",
 336  
                                  "");
 337  40
         return input;
 338  
     }
 339  
 
 340  
     public String stripTags(String input) {
 341  20
         input = stripTag("script", input);
 342  20
         input = stripTag("style", input);
 343  20
         input = input.replaceAll("<!--.*?-->", "");
 344  
 
 345  20
         input = input.replaceAll("&nbsp;", " ");
 346  20
         input = input.replaceAll("<[^>]*>", " ");
 347  20
         input = input.replaceAll("\\s+", " ");
 348  
 
 349  20
         return input;
 350  
     }
 351  
     // Goals:  1. find as many terms as possible
 352  
     //         2. find terms that are close together
 353  
     //         3. break on sentences when possible (?)
 354  
     // BUGBUG: might not have all the terms highlighted here
 355  
     public ArrayList<SnippetRegion> combineRegions(final ArrayList<SnippetRegion> regions) {
 356  8
         ArrayList<SnippetRegion> finalRegions = new ArrayList();
 357  8
         SnippetRegion last = null;
 358  8
         int snippetSize = 0;
 359  8
         int maxSize = 40;
 360  
 
 361  16
         for (int i = 0; i < regions.size(); i++) {
 362  8
             SnippetRegion current = regions.get(i);
 363  
 
 364  8
             if (last == null) {
 365  4
                 last = current;
 366  4
             } else if (last.overlap(current)) {
 367  4
                 SnippetRegion bigger = last.merge(current);
 368  
 
 369  4
                 if (bigger.size() + snippetSize > maxSize) {
 370  0
                     finalRegions.add(last);
 371  0
                     last = null;
 372  
                 } else {
 373  4
                     last = bigger;
 374  
                 }
 375  4
             } else if (last.size() + snippetSize > maxSize) {
 376  0
                 break;
 377  
             } else {
 378  0
                 finalRegions.add(last);
 379  0
                 snippetSize += last.size();
 380  0
                 last = current;
 381  
             }
 382  
         }
 383  
 
 384  8
         if (last != null && snippetSize + last.size() < maxSize) {
 385  4
             finalRegions.add(last);
 386  
         }
 387  
 
 388  8
         return finalRegions;
 389  
     }
 390  
 }