Coverage Report - org.galagosearch.core.parse.DateExtractor
 
Classes in this File | Line Coverage | Branch Coverage | Complexity
DateExtractor | 0% (0/41) | 0% (0/28) | 0
 
 1  
 // BSD License (http://www.galagosearch.org/license)
 2  
 
 3  
 package org.galagosearch.core.parse;
 4  
 
 5  
 import java.io.IOException;
 6  
 import java.util.Calendar;
 7  
 import java.util.GregorianCalendar;
 8  
 import java.util.HashMap;
 9  
 import java.util.List;
 10  
 import org.galagosearch.tupleflow.InputClass;
 11  
 import org.galagosearch.tupleflow.OutputClass;
 12  
 import org.galagosearch.tupleflow.StandardStep;
 13  
 
 14  
 /**
 15  
  * A very crude extractor of dates from text.
 16  
  * 
 17  
  * This class searches for anything that looks like a year (1000-2999), then
 18  
  * searches around that year for a month name.  A year is sufficient to emit
 19  
  * a date.  Day of the month is currently not supported.
 20  
  * 
 21  
  * @author trevor
 22  
  */
 23  
 @InputClass(className = "org.galagosearch.core.parse.Document")
 24  
 @OutputClass(className = "org.galagosearch.core.types.DateExtent")
 25  0
 public class DateExtractor extends StandardStep<Document, Document> {
 26  0
     HashMap<String, Integer> months = new HashMap<String, Integer>();
 27  
     
 28  0
     public DateExtractor() {
 29  0
         addMonth("January", "Jan", Calendar.JANUARY);
 30  0
         addMonth("February", "Feb", Calendar.FEBRUARY);
 31  0
         addMonth("March", "Mar", Calendar.MARCH);
 32  0
         addMonth("April", "Apr", Calendar.APRIL);
 33  0
         addMonth("May", "May", Calendar.MAY);
 34  0
         addMonth("June", "Jun", Calendar.JUNE);
 35  0
         addMonth("July", "Jul", Calendar.JULY);
 36  0
         addMonth("August", "Aug", Calendar.AUGUST);
 37  0
         addMonth("September", "Sep", Calendar.SEPTEMBER);
 38  0
         addMonth("October", "Oct", Calendar.OCTOBER);
 39  0
         addMonth("November", "Nov", Calendar.NOVEMBER);
 40  0
         addMonth("December", "Dec", Calendar.DECEMBER);        
 41  0
     }
 42  
     
 43  
     public void addMonth(String longMonth, String shortMonth, int value) {
 44  0
         months.put(longMonth, value);
 45  0
         months.put(shortMonth, value);
 46  0
     }
 47  
     
 48  
     public boolean isMonth(String month) {
 49  0
         return months.containsKey(month);
 50  
     }
 51  
     
 52  
     public boolean isYear(String year) {
 53  0
         if (year.length() != 4)
 54  0
             return false;
 55  
         
 56  0
         char first = year.charAt(0);
 57  0
         if (first != '1' && first != '2')
 58  0
             return false;
 59  
         
 60  0
         return Character.isDigit(year.charAt(1)) &&
 61  
                Character.isDigit(year.charAt(2)) &&
 62  
                Character.isDigit(year.charAt(3));
 63  
     }
 64  
     
 65  
     public int getMonth(List<String> terms, int i) {
 66  0
         if (i > 0 && isMonth(terms.get(i-1))) {
 67  0
             return months.get(terms.get(i-1));
 68  
         }
 69  
         
 70  0
         if (i > 0 && isMonth(terms.get(i-2))) {
 71  0
             return months.get(terms.get(i-2));
 72  
         }
 73  
 
 74  0
         if (i < terms.size()-1 && isMonth(terms.get(i+1))) {
 75  0
             return months.get(terms.get(i+1));
 76  
         }
 77  
         
 78  0
         return 0;
 79  
     }
 80  
     
 81  
     @Override
 82  
     public void process(Document object) throws IOException {
 83  0
         for (int i = 0; i < object.terms.size(); ++i) {
 84  0
             String term = object.terms.get(i);
 85  
             
 86  0
             if (isYear(term)) {
 87  0
                 int year = Integer.parseInt(term);
 88  0
                 int month = getMonth(object.terms, i);
 89  
     
 90  0
                 Calendar calendar = new GregorianCalendar();
 91  0
                 calendar.set(year, month, 1);
 92  
                 
 93  
                 // TODO(trevor): add date extent
 94  
                 // processor.process(new DateExtent(i, calendar.getTime()));
 95  
             }
 96  
         }
 97  0
     }
 98  
 }