1
2 package org.galagosearch.core.parse;
3
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Collections;
7 import java.util.HashMap;
8 import java.util.HashSet;
9 import java.util.Map;
10 import java.util.logging.Level;
11 import java.util.logging.Logger;
12 import org.galagosearch.tupleflow.IncompatibleProcessorException;
13 import org.galagosearch.tupleflow.InputClass;
14 import org.galagosearch.tupleflow.Linkage;
15 import org.galagosearch.tupleflow.NullProcessor;
16 import org.galagosearch.tupleflow.OutputClass;
17 import org.galagosearch.tupleflow.Processor;
18 import org.galagosearch.tupleflow.Source;
19 import org.galagosearch.tupleflow.Step;
20 import org.galagosearch.tupleflow.Utility;
21 import org.galagosearch.tupleflow.execution.Verified;
22
23 /***
24 * <p>This class processes document text into tokens that can be indexed.</p>
25 *
26 * <p>The text is assumed to contain some HTML/XML tags. The tokenizer tries
27 * to extract as much data as possible from each document, even if it is not
28 * well formed (e.g. there are start tags with no ending tags). The resulting
29 * document object contains an array of terms and an array of tags.</p>
30 *
31 * @author trevor
32 */
33 @Verified
34 @InputClass(className = "org.galagosearch.core.parse.Document")
35 @OutputClass(className = "org.galagosearch.core.parse.Document")
36 public class TagTokenizer implements Source<Document>, Processor<Document> {
37 private static final boolean[] splits;
38 private static HashSet<String> ignoredTags;
39 public Processor<Document> processor = new NullProcessor(Document.class);
40 private StringPooler pooler = new StringPooler();
41 private String ignoreUntil;
42
43
44 static {
45 splits = buildSplits();
46 ignoredTags = buildIgnoredTags();
47 }
48 private String text;
49 private int position;
50 private int lastSplit;
51 ArrayList<String> tokens;
52 HashMap<String, ArrayList<BeginTag>> openTags;
53 ArrayList<ClosedTag> closedTags;
54 ArrayList<Pair> tokenPositions;
55
56 public static class Pair {
57 public Pair(int start, int end) {
58 this.start = start;
59 this.end = end;
60 }
61 public int start;
62 public int end;
63
64 public String toString() {
65 return String.format("%d,%d", start, end);
66 }
67 }
68
69 private enum StringStatus {
70 Clean,
71 NeedsSimpleFix,
72 NeedsComplexFix,
73 NeedsAcronymProcessing
74 }
75
76 public TagTokenizer() {
77 text = null;
78 position = 0;
79 lastSplit = -1;
80
81 tokens = new ArrayList<String>();
82 openTags = new HashMap<String, ArrayList<BeginTag>>();
83 closedTags = new ArrayList<ClosedTag>();
84 tokenPositions = new ArrayList<Pair>();
85 }
86
87 private static boolean[] buildSplits() {
88 boolean[] localSplits = new boolean[257];
89
90 for (int i = 0; i < localSplits.length; i++) {
91 localSplits[i] = false;
92 }
93 char[] splitChars = {' ', '\t', '\n', '\r',
94 ';', '\"', '&', '/', ':', '!', '#',
95 '?', '$', '%', '(', ')', '@', '^',
96 '*', '+', '-', ',', '=', '>', '<', '[',
97 ']', '{', '}', '|', '`', '~', '_'
98 };
99
100 for (char c : splitChars) {
101 localSplits[(byte) c] = true;
102 }
103
104 for (byte c = 0; c <= 32; c++) {
105 localSplits[c] = true;
106 }
107
108 return localSplits;
109 }
110
111 private static HashSet<String> buildIgnoredTags() {
112 HashSet<String> tags = new HashSet<String>();
113 tags.add("style");
114 tags.add("script");
115 return tags;
116 }
117
118 class ClosedTag {
119 public ClosedTag(BeginTag begin) {
120 this.name = begin.name;
121 this.attributes = begin.attributes;
122
123 this.byteStart = begin.bytePosition;
124 this.termStart = begin.termPosition;
125
126 this.byteEnd = position;
127 this.termEnd = tokens.size();
128 }
129 String name;
130 Map<String, String> attributes;
131 int byteStart;
132 int termStart;
133 int byteEnd;
134 int termEnd;
135 }
136
137 class BeginTag {
138 public BeginTag(String name, Map<String, String> attributes) {
139 this.name = name;
140 this.attributes = attributes;
141
142 this.bytePosition = position;
143 this.termPosition = tokens.size();
144 }
145 String name;
146 Map<String, String> attributes;
147 int bytePosition;
148 int termPosition;
149 }
150
151 /***
152 * Resets parsing in preparation for the next document.
153 */
154 public void reset() {
155 ignoreUntil = null;
156 text = null;
157 position = 0;
158 lastSplit = -1;
159
160 tokens.clear();
161 openTags.clear();
162 closedTags.clear();
163
164 if (tokenPositions != null) {
165 tokenPositions.clear();
166 }
167 }
168
169 private void skipComment() {
170 if (text.substring(position).startsWith("<!--")) {
171 position = text.indexOf("-->", position + 1);
172
173 if (position >= 0) {
174 position += 2;
175 }
176 } else {
177 position = text.indexOf(">", position + 1);
178 }
179
180 if (position < 0) {
181 position = text.length();
182 }
183 }
184
185 private void skipProcessingInstruction() {
186 position = text.indexOf("?>", position + 1);
187
188 if (position < 0) {
189 position = text.length();
190 }
191 }
192
193 private void parseEndTag() {
194
195 int i;
196
197 for (i = position + 2; i < text.length(); i++) {
198 char c = text.charAt(i);
199 if (Character.isSpaceChar(c) || c == '>') {
200 break;
201 }
202 }
203
204 String tagName = text.substring(position + 2, i).toLowerCase();
205
206 if (ignoreUntil != null && ignoreUntil.equals(tagName)) {
207 ignoreUntil = null;
208 }
209 if (ignoreUntil == null) {
210 closeTag(tagName);
211 }
212 while (i < text.length() && text.charAt(i) != '>') {
213 i++;
214 }
215 position = i;
216 }
217
218 private void closeTag(final String tagName) {
219 if (!openTags.containsKey(tagName)) {
220 return;
221 }
222 ArrayList<BeginTag> tagList = openTags.get(tagName);
223
224 if (tagList.size() > 0) {
225 int last = tagList.size() - 1;
226
227 BeginTag openTag = tagList.get(last);
228 ClosedTag closedTag = new ClosedTag(openTag);
229 closedTags.add(closedTag);
230
231 tagList.remove(last);
232 }
233 }
234
235 private int indexOfNonSpace(int start) {
236 if (start < 0) {
237 return Integer.MIN_VALUE;
238 }
239 for (int i = start; i < text.length(); i++) {
240 char c = text.charAt(i);
241 if (!Character.isSpaceChar(c)) {
242 return i;
243 }
244 }
245
246 return Integer.MIN_VALUE;
247 }
248
249 private int indexOfEndAttribute(int start, int tagEnd) {
250 if (start < 0) {
251 return Integer.MIN_VALUE;
252
253 }
254 boolean inQuote = false;
255 boolean lastEscape = false;
256
257 for (int i = start; i <= tagEnd; i++) {
258 char c = text.charAt(i);
259
260 if ((c == '\"' || c == '\'') && !lastEscape) {
261 inQuote = !inQuote;
262 if (!inQuote) {
263 return i;
264 }
265 } else if (!inQuote && (Character.isSpaceChar(c) || c == '>')) {
266 return i;
267 } else if (c == '//' && !lastEscape) {
268 lastEscape = true;
269 } else {
270 lastEscape = false;
271 }
272 }
273
274 return Integer.MIN_VALUE;
275 }
276
277 private int indexOfSpace(int start) {
278 if (start < 0) {
279 return Integer.MIN_VALUE;
280 }
281 for (int i = start; i < text.length(); i++) {
282 char c = text.charAt(i);
283 if (Character.isSpaceChar(c)) {
284 return i;
285 }
286 }
287
288 return Integer.MIN_VALUE;
289 }
290
291 private int indexOfEquals(int start, int end) {
292 if (start < 0) {
293 return Integer.MIN_VALUE;
294 }
295 for (int i = start; i < end; i++) {
296 char c = text.charAt(i);
297 if (c == '=') {
298 return i;
299 }
300 }
301
302 return Integer.MIN_VALUE;
303 }
304
305 private void parseBeginTag() {
306
307 int i;
308
309 for (i = position + 1; i < text.length(); i++) {
310 char c = text.charAt(i);
311 if (Character.isSpaceChar(c) || c == '>') {
312 break;
313 }
314 }
315
316 String tagName = text.substring(position + 1, i).toLowerCase();
317
318
319 i = indexOfNonSpace(i);
320 int tagEnd = text.indexOf(">", i + 1);
321 boolean closeIt = false;
322
323 HashMap<String, String> attributes = new HashMap<String, String>();
324 while (i < tagEnd && i >= 0 && tagEnd >= 0) {
325
326 int start = indexOfNonSpace(i);
327
328 if (start > 0) {
329 if (text.charAt(start) == '>') {
330 i = start;
331 break;
332 } else if (text.charAt(start) == '/' &&
333 text.length() > start + 1 &&
334 text.charAt(start + 1) == '>') {
335 i = start + 1;
336 closeIt = true;
337 break;
338 }
339 }
340
341 int end = indexOfEndAttribute(start, tagEnd);
342 int equals = indexOfEquals(start, end);
343
344
345 if (equals < 0 || equals == start || end == equals) {
346
347 if (end < 0) {
348 i = tagEnd;
349 break;
350 } else {
351 i = end;
352 continue;
353 }
354 }
355
356
357 int startKey = start;
358 int endKey = equals;
359
360 int startValue = equals + 1;
361 int endValue = end;
362
363 if (text.charAt(startValue) == '\"' || text.charAt(startValue) == '\'') {
364 startValue++;
365 }
366 if (startValue >= endValue || startKey >= endKey) {
367 i = end;
368 continue;
369 }
370
371 String key = text.substring(startKey, endKey);
372 String value = text.substring(startValue, endValue);
373
374 attributes.put(key.toLowerCase(), value);
375
376 if (end >= text.length()) {
377 endParsing();
378 break;
379 }
380
381 if (text.charAt(end) == '\"' || text.charAt(end) == '\'') {
382 end++;
383 }
384
385 i = end;
386 }
387
388 if (!ignoredTags.contains(tagName)) {
389 BeginTag tag = new BeginTag(tagName, attributes);
390
391 if (!openTags.containsKey(tagName)) {
392 ArrayList tagList = new ArrayList();
393 tagList.add(tag);
394 openTags.put(tagName, tagList);
395 } else {
396 openTags.get(tagName).add(tag);
397 }
398
399 if (closeIt) {
400 closeTag(tagName);
401 }
402 } else if (!closeIt) {
403 ignoreUntil = tagName;
404 }
405
406 position = i;
407 }
408
409 private void endParsing() {
410 position = text.length();
411 }
412
413 private void onSplit() {
414 if (position - lastSplit > 1) {
415 int start = lastSplit + 1;
416 String token = text.substring(start, position);
417 StringStatus status = checkTokenStatus(token);
418
419 switch (status) {
420 case NeedsSimpleFix:
421 token = tokenSimpleFix(token);
422 break;
423
424 case NeedsComplexFix:
425 token = tokenComplexFix(token);
426 break;
427
428 case NeedsAcronymProcessing:
429 tokenAcronymProcessing(token, start, position);
430 break;
431
432 case Clean:
433
434 break;
435 }
436
437 if (status != StringStatus.NeedsAcronymProcessing) {
438 addToken(token, start, position);
439 }
440 }
441
442 lastSplit = position;
443 }
444
445 /***
446 * Adds a token to the document object. This method currently drops tokens
447 * longer than 100 bytes long right now.
448 *
449 * @param token The token to add.
450 * @param start The starting byte offset of the token in the document text.
451 * @param end The ending byte offset of the token in the document text.
452 */
453 private void addToken(final String token, int start, int end) {
454 final int maxTokenLength = 100;
455
456 if (token.length() <= 0) {
457 return;
458 }
459
460
461 if (token.length() > maxTokenLength / 6 &&
462 Utility.makeBytes(token).length >= maxTokenLength) {
463 return;
464 }
465 tokens.add(token);
466 tokenPositions.add(new Pair(start, end));
467 }
468
469 private String tokenComplexFix(String token) {
470 token = tokenSimpleFix(token);
471 token = token.toLowerCase();
472
473 return token;
474 }
475
476 /***
477 * This method does three kinds of processing:
478 * <ul>
479 * <li>If the token contains periods at the beginning or the end,
480 * they are removed.</li>
481 * <li>If the token contains single letters followed by periods, such
482 * as I.B.M., C.I.A., or U.S.A., the periods are removed.</li>
483 * <li>If, instead, the token contains longer strings of text with
484 * periods in the middle, the token is split into
485 * smaller tokens ("umass.edu" becomes {"umass", "edu"}). Notice
486 * that this means ("ph.d." becomes {"ph", "d"}).</li>
487 * </ul>
488 *
489 * @param token
490 * @param start
491 * @param end
492 */
493 private void tokenAcronymProcessing(String token, int start, int end) {
494 token = tokenComplexFix(token);
495
496
497 while (token.startsWith(".")) {
498 token = token.substring(1);
499 start = start + 1;
500 }
501
502 while (token.endsWith(".")) {
503 token = token.substring(0, token.length() - 1);
504 end -= 1;
505 }
506
507
508 if (token.indexOf('.') >= 0) {
509
510
511 boolean isAcronym = token.length() > 0;
512 for (int pos = 1; pos < token.length(); pos += 2) {
513 if (token.charAt(pos) != '.') {
514 isAcronym = false;
515 }
516 }
517
518 if (isAcronym) {
519 token = token.replace(".", "");
520 addToken(token, start, end);
521 } else {
522 int s = 0;
523 for (int e = 0; e < token.length(); e++) {
524 if (token.charAt(e) == '.') {
525 if (e - s > 1) {
526 String subtoken = token.substring(s, e);
527 addToken(subtoken, start + s, start + e);
528 }
529 s = e + 1;
530 }
531 }
532
533 if (token.length() - s > 1) {
534 String subtoken = token.substring(s);
535 addToken(subtoken, start + s, end);
536 }
537 }
538 } else {
539 addToken(token, start, end);
540 }
541 }
542
543 /***
544 * Scans through the token, removing apostrophes and converting
545 * uppercase to lowercase letters.
546 *
547 * @param token
548 * @return
549 */
550 private String tokenSimpleFix(String token) {
551 char[] chars = token.toCharArray();
552 int j = 0;
553
554 for (int i = 0; i < chars.length; i++) {
555 char c = chars[i];
556 boolean isAsciiUppercase = (c >= 'A' && c <= 'Z');
557 boolean isApostrophe = (c == '\'');
558
559 if (isAsciiUppercase) {
560 chars[j] = (char) (chars[i] + 'a' - 'A');
561 } else if (isApostrophe) {
562
563 j--;
564 } else {
565 chars[j] = chars[i];
566 }
567
568 j++;
569 }
570
571 token = new String(chars, 0, j);
572 return token;
573 }
574
575 /***
576 * This method scans the token, looking for uppercase characters and
577 * special characters. If the token contains only numbers and lowercase
578 * letters, it needs no further processing, and it returns Clean.
579 * If it also contains uppercase letters or apostrophes, it returns
580 * NeedsSimpleFix. If it contains special characters (especially Unicode
581 * characters), it returns NeedsComplexFix. Finally, if any periods are
582 * present, this returns NeedsAcronymProcessing.
583 *
584 * @param token
585 * @return
586 */
587 private StringStatus checkTokenStatus(final String token) {
588 StringStatus status = StringStatus.Clean;
589 char[] chars = token.toCharArray();
590
591 for (int i = 0; i < chars.length; i++) {
592 char c = chars[i];
593 boolean isAsciiLowercase = (c >= 'a' && c <= 'z');
594 boolean isAsciiNumber = (c >= '0' && c <= '9');
595
596 if (isAsciiLowercase || isAsciiNumber) {
597 continue;
598 }
599 boolean isAsciiUppercase = (c >= 'A' && c <= 'Z');
600 boolean isPeriod = (c == '.');
601 boolean isApostrophe = (c == '\'');
602
603 if ((isAsciiUppercase || isApostrophe) && status == StringStatus.Clean) {
604 status = StringStatus.NeedsSimpleFix;
605 } else if (!isPeriod) {
606 status = StringStatus.NeedsComplexFix;
607 } else {
608 status = StringStatus.NeedsAcronymProcessing;
609 break;
610 }
611 }
612
613 return status;
614 }
615
616 private void onStartBracket() {
617 if (position + 1 < text.length()) {
618 char c = text.charAt(position + 1);
619
620 if (c == '/') {
621 parseEndTag();
622 } else if (c == '!') {
623 skipComment();
624 } else if (c == '?') {
625 skipProcessingInstruction();
626 } else {
627 parseBeginTag();
628 }
629 } else {
630 endParsing();
631 }
632
633 lastSplit = position;
634 }
635
636 /***
637 * Translates tags from the internal ClosedTag format to the
638 * Tag type.
639 */
640 private ArrayList<Tag> coalesceTags() {
641 ArrayList<Tag> result = new ArrayList();
642
643
644 for (ArrayList<BeginTag> tagList : openTags.values()) {
645 for (BeginTag tag : tagList) {
646 result.add(new Tag(tag.name, tag.attributes, tag.termPosition, tag.termPosition));
647 }
648 }
649
650 for (ClosedTag tag : closedTags) {
651 result.add(new Tag(tag.name, tag.attributes, tag.termStart, tag.termEnd));
652 }
653
654 Collections.sort(result);
655 return result;
656 }
657
658 public void onAmpersand() {
659 onSplit();
660
661 for (int i = position + 1; i < text.length(); i++) {
662 char c = text.charAt(i);
663
664 if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9' || c == '#') {
665 continue;
666 }
667 if (c == ';') {
668 position = i;
669 lastSplit = i;
670 return;
671 }
672
673
674 break;
675 }
676 }
677
678 /***
679 * Parses the text in the document.text attribute and fills in the
680 * document.terms and document.tags arrays, then passes that document
681 * to the next processing stage.
682 *
683 * @param document
684 * @throws java.io.IOException
685 */
686 public void process(Document document) throws IOException {
687 tokenize(document);
688 processor.process(document);
689 }
690
691 /***
692 * Parses the text in the document.text attribute and fills in the
693 * document.terms and document.tags arrays.
694 *
695 * @param document
696 * @throws java.io.IOException
697 */
698 public void tokenize(Document document) {
699 reset();
700 text = document.text;
701
702 try {
703
704
705
706
707
708
709 for (; position >= 0 && position < text.length(); position++) {
710 char c = text.charAt(position);
711
712 if (c == '<') {
713 if (ignoreUntil == null) {
714 onSplit();
715 }
716 onStartBracket();
717 } else if (ignoreUntil != null) {
718 continue;
719 } else if (c == '&') {
720 onAmpersand();
721 } else if (c < 256 && splits[c]) {
722 onSplit();
723 }
724 }
725 } catch (Exception e) {
726 Logger.getLogger(getClass().toString()).log(Level.WARNING,
727 "Parse failure: " + document.identifier);
728 }
729
730 if (ignoreUntil == null) {
731 onSplit();
732 }
733 document.terms = new ArrayList<String>(this.tokens);
734 document.tags = coalesceTags();
735 pooler.transform(document);
736 }
737
738 /***
739 * Parses the text in the input string and returns a document object.
740 * This method calls the {#link tokenize(Document) other variant}.
741 *
742 * @return A new document object containing the parsed text from the input string.
743 */
744 public Document tokenize(String text) throws IOException {
745 Document document = new Document();
746 document.text = text;
747 tokenize(document);
748
749 return document;
750 }
751
752 public ArrayList<Pair> getTokenPositions() {
753 return this.tokenPositions;
754 }
755
756 public void setProcessor(final Step processor) throws IncompatibleProcessorException {
757 Linkage.link(this, processor);
758 }
759
760 public void close() throws IOException {
761 processor.close();
762 }
763
764 public Class<Document> getInputClass() {
765 return Document.class;
766 }
767
768 public Class<Document> getOutputClass() {
769 return Document.class;
770 }
771 }