View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   
3   package org.galagosearch.core.parse;
4   
5   import java.io.IOException;
6   import java.util.Calendar;
7   import java.util.GregorianCalendar;
8   import java.util.HashMap;
9   import java.util.List;
10  import org.galagosearch.tupleflow.InputClass;
11  import org.galagosearch.tupleflow.OutputClass;
12  import org.galagosearch.tupleflow.StandardStep;
13  
14  /***
15   * A very crude extractor of dates from text.
16   * 
17   * This class searches for anything that looks like a year (1000-2999), then
18   * searches around that year for a month name.  A year is sufficient to emit
19   * a date.  Day of the month is currently not supported.
20   * 
21   * @author trevor
22   */
23  @InputClass(className = "org.galagosearch.core.parse.Document")
24  @OutputClass(className = "org.galagosearch.core.types.DateExtent")
25  public class DateExtractor extends StandardStep<Document, Document> {
26      HashMap<String, Integer> months = new HashMap<String, Integer>();
27      
28      public DateExtractor() {
29          addMonth("January", "Jan", Calendar.JANUARY);
30          addMonth("February", "Feb", Calendar.FEBRUARY);
31          addMonth("March", "Mar", Calendar.MARCH);
32          addMonth("April", "Apr", Calendar.APRIL);
33          addMonth("May", "May", Calendar.MAY);
34          addMonth("June", "Jun", Calendar.JUNE);
35          addMonth("July", "Jul", Calendar.JULY);
36          addMonth("August", "Aug", Calendar.AUGUST);
37          addMonth("September", "Sep", Calendar.SEPTEMBER);
38          addMonth("October", "Oct", Calendar.OCTOBER);
39          addMonth("November", "Nov", Calendar.NOVEMBER);
40          addMonth("December", "Dec", Calendar.DECEMBER);        
41      }
42      
43      public void addMonth(String longMonth, String shortMonth, int value) {
44          months.put(longMonth, value);
45          months.put(shortMonth, value);
46      }
47      
48      public boolean isMonth(String month) {
49          return months.containsKey(month);
50      }
51      
52      public boolean isYear(String year) {
53          if (year.length() != 4)
54              return false;
55          
56          char first = year.charAt(0);
57          if (first != '1' && first != '2')
58              return false;
59          
60          return Character.isDigit(year.charAt(1)) &&
61                 Character.isDigit(year.charAt(2)) &&
62                 Character.isDigit(year.charAt(3));
63      }
64      
65      public int getMonth(List<String> terms, int i) {
66          if (i > 0 && isMonth(terms.get(i-1))) {
67              return months.get(terms.get(i-1));
68          }
69          
70          if (i > 0 && isMonth(terms.get(i-2))) {
71              return months.get(terms.get(i-2));
72          }
73  
74          if (i < terms.size()-1 && isMonth(terms.get(i+1))) {
75              return months.get(terms.get(i+1));
76          }
77          
78          return 0;
79      }
80      
81      @Override
82      public void process(Document object) throws IOException {
83          for (int i = 0; i < object.terms.size(); ++i) {
84              String term = object.terms.get(i);
85              
86              if (isYear(term)) {
87                  int year = Integer.parseInt(term);
88                  int month = getMonth(object.terms, i);
89      
90                  Calendar calendar = new GregorianCalendar();
91                  calendar.set(year, month, 1);
92                  
93                  // TODO(trevor): add date extent
94                  // processor.process(new DateExtent(i, calendar.getTime()));
95              }
96          }
97      }
98  }