| 1 | |
|
| 2 | |
|
| 3 | |
package org.galagosearch.core.parse; |
| 4 | |
|
| 5 | |
import java.io.IOException; |
| 6 | |
import java.util.Calendar; |
| 7 | |
import java.util.GregorianCalendar; |
| 8 | |
import java.util.HashMap; |
| 9 | |
import java.util.List; |
| 10 | |
import org.galagosearch.tupleflow.InputClass; |
| 11 | |
import org.galagosearch.tupleflow.OutputClass; |
| 12 | |
import org.galagosearch.tupleflow.StandardStep; |
| 13 | |
|
| 14 | |
|
| 15 | |
|
| 16 | |
|
| 17 | |
|
| 18 | |
|
| 19 | |
|
| 20 | |
|
| 21 | |
|
| 22 | |
|
| 23 | |
@InputClass(className = "org.galagosearch.core.parse.Document") |
| 24 | |
@OutputClass(className = "org.galagosearch.core.types.DateExtent") |
| 25 | 0 | public class DateExtractor extends StandardStep<Document, Document> { |
| 26 | 0 | HashMap<String, Integer> months = new HashMap<String, Integer>(); |
| 27 | |
|
| 28 | 0 | public DateExtractor() { |
| 29 | 0 | addMonth("January", "Jan", Calendar.JANUARY); |
| 30 | 0 | addMonth("February", "Feb", Calendar.FEBRUARY); |
| 31 | 0 | addMonth("March", "Mar", Calendar.MARCH); |
| 32 | 0 | addMonth("April", "Apr", Calendar.APRIL); |
| 33 | 0 | addMonth("May", "May", Calendar.MAY); |
| 34 | 0 | addMonth("June", "Jun", Calendar.JUNE); |
| 35 | 0 | addMonth("July", "Jul", Calendar.JULY); |
| 36 | 0 | addMonth("August", "Aug", Calendar.AUGUST); |
| 37 | 0 | addMonth("September", "Sep", Calendar.SEPTEMBER); |
| 38 | 0 | addMonth("October", "Oct", Calendar.OCTOBER); |
| 39 | 0 | addMonth("November", "Nov", Calendar.NOVEMBER); |
| 40 | 0 | addMonth("December", "Dec", Calendar.DECEMBER); |
| 41 | 0 | } |
| 42 | |
|
| 43 | |
public void addMonth(String longMonth, String shortMonth, int value) { |
| 44 | 0 | months.put(longMonth, value); |
| 45 | 0 | months.put(shortMonth, value); |
| 46 | 0 | } |
| 47 | |
|
| 48 | |
public boolean isMonth(String month) { |
| 49 | 0 | return months.containsKey(month); |
| 50 | |
} |
| 51 | |
|
| 52 | |
public boolean isYear(String year) { |
| 53 | 0 | if (year.length() != 4) |
| 54 | 0 | return false; |
| 55 | |
|
| 56 | 0 | char first = year.charAt(0); |
| 57 | 0 | if (first != '1' && first != '2') |
| 58 | 0 | return false; |
| 59 | |
|
| 60 | 0 | return Character.isDigit(year.charAt(1)) && |
| 61 | |
Character.isDigit(year.charAt(2)) && |
| 62 | |
Character.isDigit(year.charAt(3)); |
| 63 | |
} |
| 64 | |
|
| 65 | |
public int getMonth(List<String> terms, int i) { |
| 66 | 0 | if (i > 0 && isMonth(terms.get(i-1))) { |
| 67 | 0 | return months.get(terms.get(i-1)); |
| 68 | |
} |
| 69 | |
|
| 70 | 0 | if (i > 0 && isMonth(terms.get(i-2))) { |
| 71 | 0 | return months.get(terms.get(i-2)); |
| 72 | |
} |
| 73 | |
|
| 74 | 0 | if (i < terms.size()-1 && isMonth(terms.get(i+1))) { |
| 75 | 0 | return months.get(terms.get(i+1)); |
| 76 | |
} |
| 77 | |
|
| 78 | 0 | return 0; |
| 79 | |
} |
| 80 | |
|
| 81 | |
@Override |
| 82 | |
public void process(Document object) throws IOException { |
| 83 | 0 | for (int i = 0; i < object.terms.size(); ++i) { |
| 84 | 0 | String term = object.terms.get(i); |
| 85 | |
|
| 86 | 0 | if (isYear(term)) { |
| 87 | 0 | int year = Integer.parseInt(term); |
| 88 | 0 | int month = getMonth(object.terms, i); |
| 89 | |
|
| 90 | 0 | Calendar calendar = new GregorianCalendar(); |
| 91 | 0 | calendar.set(year, month, 1); |
| 92 | |
|
| 93 | |
|
| 94 | |
|
| 95 | |
} |
| 96 | |
} |
| 97 | 0 | } |
| 98 | |
} |