Coverage Report - org.galagosearch.core.parse.WordCounter
 
Classes in this File Line Coverage Branch Coverage Complexity
WordCounter
0%
0/49
0%
0/32
0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 package org.galagosearch.core.parse;
 3  
 
 4  
 import java.io.File;
 5  
 import java.io.IOException;
 6  
 import java.util.ArrayList;
 7  
 import java.util.HashMap;
 8  
 import java.util.HashSet;
 9  
 import java.util.List;
 10  
 import org.galagosearch.core.types.WordCount;
 11  
 import org.galagosearch.tupleflow.InputClass;
 12  
 import org.galagosearch.tupleflow.OutputClass;
 13  
 import org.galagosearch.tupleflow.Reducer;
 14  
 import org.galagosearch.tupleflow.StandardStep;
 15  
 import org.galagosearch.tupleflow.TupleFlowParameters;
 16  
 import org.galagosearch.tupleflow.Utility;
 17  
 import org.galagosearch.tupleflow.execution.Verified;
 18  
 
 19  
 /**
 20  
  *
 21  
  * @author trevor
 22  
  */
 23  
 @Verified
 24  
 @InputClass(className = "org.galagosearch.core.parse.Document")
 25  
 @OutputClass(className = "org.galagosearch.core.types.WordCount")
 26  0
 public class WordCounter extends StandardStep<Document, WordCount> implements Reducer<WordCount> {
 27  0
     int maxWidth = 1;
 28  
     HashSet<String> filterWords;
 29  
 
 30  0
     public WordCounter(TupleFlowParameters parameters) throws IOException {
 31  0
         maxWidth = (int) parameters.getXML().get("width", 1);
 32  0
         String filename = parameters.getXML().get("filter", (String) null);
 33  0
         if (filename != null) {
 34  0
             filterWords = Utility.readFileToStringSet(new File(filename));
 35  
         } else {
 36  0
             filterWords = null;
 37  
         }
 38  0
     }
 39  
 
 40  
     public void process(Document document) throws IOException {
 41  0
         List<String> tokens = document.terms;
 42  0
         HashMap<String, WordCount> countObjects = new HashMap<String, WordCount>();
 43  
 
 44  0
         for (int i = 0; i < tokens.size(); i++) {
 45  0
             String token = tokens.get(i);
 46  
 
 47  0
             if (token == null) {
 48  0
                 continue;
 49  
             }
 50  0
             updateCounts(token, countObjects);
 51  
 
 52  0
             if (maxWidth > 1) {
 53  0
                 StringBuilder builder = new StringBuilder();
 54  0
                 builder.append(token);
 55  
 
 56  0
                 int end = Math.min(i + maxWidth, tokens.size());
 57  0
                 for (int j = i + 1; j < end; j++) {
 58  0
                     token = tokens.get(j);
 59  
 
 60  0
                     if (token == null) {
 61  0
                         break;
 62  
                     }
 63  0
                     builder.append(' ');
 64  0
                     builder.append(token);
 65  
 
 66  0
                     updateCounts(builder.toString(), countObjects);
 67  
                 }
 68  
             }
 69  
         }
 70  
 
 71  0
         for (WordCount count : countObjects.values()) {
 72  0
             assert count != null;
 73  0
             assert count.word != null;
 74  0
             processor.process(count);
 75  
         }
 76  0
     }
 77  
 
 78  
     public ArrayList<WordCount> reduce(List<WordCount> input) throws IOException {
 79  0
         HashMap<String, WordCount> countObjects = new HashMap<String, WordCount>(input.size() / 5);
 80  
 
 81  0
         for (WordCount wordCount : input) {
 82  0
             WordCount original = countObjects.get(wordCount.word);
 83  
 
 84  0
             if (original == null) {
 85  0
                 countObjects.put(wordCount.word, original);
 86  
             } else {
 87  0
                 original.documents += wordCount.documents;
 88  0
                 original.count += wordCount.count;
 89  
             }
 90  0
         }
 91  
 
 92  0
         return new ArrayList<WordCount>(countObjects.values());
 93  
     }
 94  
 
 95  
     void updateCounts(String token, HashMap<String, WordCount> countObjects) {
 96  0
         WordCount wordCount = countObjects.get(token);
 97  
 
 98  0
         if (filterWords != null && !filterWords.contains(token)) {
 99  0
             return;
 100  
         }
 101  0
         if (wordCount != null) {
 102  0
             wordCount.count += 1;
 103  
         } else {
 104  0
             wordCount = new WordCount(new String(token), 1, 1);
 105  0
             countObjects.put(token, wordCount);
 106  
         }
 107  0
     }
 108  
 }