1
2
3 package org.galagosearch.core.parse;
4
5 import java.io.IOException;
6 import java.util.Calendar;
7 import java.util.GregorianCalendar;
8 import java.util.HashMap;
9 import java.util.List;
10 import org.galagosearch.tupleflow.InputClass;
11 import org.galagosearch.tupleflow.OutputClass;
12 import org.galagosearch.tupleflow.StandardStep;
13
14 /***
15 * A very crude extractor of dates from text.
16 *
17 * This class searches for anything that looks like a year (1000-2999), then
18 * searches around that year for a month name. A year is sufficient to emit
19 * a date. Day of the month is currently not supported.
20 *
21 * @author trevor
22 */
23 @InputClass(className = "org.galagosearch.core.parse.Document")
24 @OutputClass(className = "org.galagosearch.core.types.DateExtent")
25 public class DateExtractor extends StandardStep<Document, Document> {
26 HashMap<String, Integer> months = new HashMap<String, Integer>();
27
28 public DateExtractor() {
29 addMonth("January", "Jan", Calendar.JANUARY);
30 addMonth("February", "Feb", Calendar.FEBRUARY);
31 addMonth("March", "Mar", Calendar.MARCH);
32 addMonth("April", "Apr", Calendar.APRIL);
33 addMonth("May", "May", Calendar.MAY);
34 addMonth("June", "Jun", Calendar.JUNE);
35 addMonth("July", "Jul", Calendar.JULY);
36 addMonth("August", "Aug", Calendar.AUGUST);
37 addMonth("September", "Sep", Calendar.SEPTEMBER);
38 addMonth("October", "Oct", Calendar.OCTOBER);
39 addMonth("November", "Nov", Calendar.NOVEMBER);
40 addMonth("December", "Dec", Calendar.DECEMBER);
41 }
42
43 public void addMonth(String longMonth, String shortMonth, int value) {
44 months.put(longMonth, value);
45 months.put(shortMonth, value);
46 }
47
48 public boolean isMonth(String month) {
49 return months.containsKey(month);
50 }
51
52 public boolean isYear(String year) {
53 if (year.length() != 4)
54 return false;
55
56 char first = year.charAt(0);
57 if (first != '1' && first != '2')
58 return false;
59
60 return Character.isDigit(year.charAt(1)) &&
61 Character.isDigit(year.charAt(2)) &&
62 Character.isDigit(year.charAt(3));
63 }
64
65 public int getMonth(List<String> terms, int i) {
66 if (i > 0 && isMonth(terms.get(i-1))) {
67 return months.get(terms.get(i-1));
68 }
69
70 if (i > 0 && isMonth(terms.get(i-2))) {
71 return months.get(terms.get(i-2));
72 }
73
74 if (i < terms.size()-1 && isMonth(terms.get(i+1))) {
75 return months.get(terms.get(i+1));
76 }
77
78 return 0;
79 }
80
81 @Override
82 public void process(Document object) throws IOException {
83 for (int i = 0; i < object.terms.size(); ++i) {
84 String term = object.terms.get(i);
85
86 if (isYear(term)) {
87 int year = Integer.parseInt(term);
88 int month = getMonth(object.terms, i);
89
90 Calendar calendar = new GregorianCalendar();
91 calendar.set(year, month, 1);
92
93
94
95 }
96 }
97 }
98 }