View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.parse;
3   
4   import java.io.IOException;
5   import java.util.ArrayList;
6   import java.util.Collections;
7   import java.util.HashMap;
8   import java.util.HashSet;
9   import java.util.Map;
10  import java.util.logging.Level;
11  import java.util.logging.Logger;
12  import org.galagosearch.tupleflow.IncompatibleProcessorException;
13  import org.galagosearch.tupleflow.InputClass;
14  import org.galagosearch.tupleflow.Linkage;
15  import org.galagosearch.tupleflow.NullProcessor;
16  import org.galagosearch.tupleflow.OutputClass;
17  import org.galagosearch.tupleflow.Processor;
18  import org.galagosearch.tupleflow.Source;
19  import org.galagosearch.tupleflow.Step;
20  import org.galagosearch.tupleflow.Utility;
21  import org.galagosearch.tupleflow.execution.Verified;
22  
23  /***
24   * <p>This class processes document text into tokens that can be indexed.</p>
25   * 
26   * <p>The text is assumed to contain some HTML/XML tags.  The tokenizer tries
27   * to extract as much data as possible from each document, even if it is not
28   * well formed (e.g. there are start tags with no ending tags).  The resulting
29   * document object contains an array of terms and an array of tags.</p> 
30   * 
31   * @author trevor
32   */
33  @Verified
34  @InputClass(className = "org.galagosearch.core.parse.Document")
35  @OutputClass(className = "org.galagosearch.core.parse.Document")
36  public class TagTokenizer implements Source<Document>, Processor<Document> {
37      private static final boolean[] splits;
38      private static HashSet<String> ignoredTags;
39      public Processor<Document> processor = new NullProcessor(Document.class);
40      private StringPooler pooler = new StringPooler();
41      private String ignoreUntil;
42      
43  
44      static {
45          splits = buildSplits();
46          ignoredTags = buildIgnoredTags();
47      }
48      private String text;
49      private int position;
50      private int lastSplit;
51      ArrayList<String> tokens;
52      HashMap<String, ArrayList<BeginTag>> openTags;
53      ArrayList<ClosedTag> closedTags;
54      ArrayList<Pair> tokenPositions;
55  
56      public static class Pair {
57          public Pair(int start, int end) {
58              this.start = start;
59              this.end = end;
60          }
61          public int start;
62          public int end;
63  
64          public String toString() {
65              return String.format("%d,%d", start, end);
66          }
67      }
68  
69      private enum StringStatus {
70          Clean,
71          NeedsSimpleFix,
72          NeedsComplexFix,
73          NeedsAcronymProcessing
74      }
75  
76      public TagTokenizer() {
77          text = null;
78          position = 0;
79          lastSplit = -1;
80  
81          tokens = new ArrayList<String>();
82          openTags = new HashMap<String, ArrayList<BeginTag>>();
83          closedTags = new ArrayList<ClosedTag>();
84          tokenPositions = new ArrayList<Pair>();
85      }
86  
87      private static boolean[] buildSplits() {
88          boolean[] localSplits = new boolean[257];
89  
90          for (int i = 0; i < localSplits.length; i++) {
91              localSplits[i] = false;
92          }
93          char[] splitChars = {' ', '\t', '\n', '\r', // spaces
94              ';', '\"', '&', '/', ':', '!', '#',
95              '?', '$', '%', '(', ')', '@', '^',
96              '*', '+', '-', ',', '=', '>', '<', '[',
97              ']', '{', '}', '|', '`', '~', '_'
98          };
99  
100         for (char c : splitChars) {
101             localSplits[(byte) c] = true;
102         }
103 
104         for (byte c = 0; c <= 32; c++) {
105             localSplits[c] = true;
106         }
107 
108         return localSplits;
109     }
110 
111     private static HashSet<String> buildIgnoredTags() {
112         HashSet<String> tags = new HashSet<String>();
113         tags.add("style");
114         tags.add("script");
115         return tags;
116     }
117 
118     class ClosedTag {
119         public ClosedTag(BeginTag begin) {
120             this.name = begin.name;
121             this.attributes = begin.attributes;
122 
123             this.byteStart = begin.bytePosition;
124             this.termStart = begin.termPosition;
125 
126             this.byteEnd = position;
127             this.termEnd = tokens.size();
128         }
129         String name;
130         Map<String, String> attributes;
131         int byteStart;
132         int termStart;
133         int byteEnd;
134         int termEnd;
135     }
136 
137     class BeginTag {
138         public BeginTag(String name, Map<String, String> attributes) {
139             this.name = name;
140             this.attributes = attributes;
141 
142             this.bytePosition = position;
143             this.termPosition = tokens.size();
144         }
145         String name;
146         Map<String, String> attributes;
147         int bytePosition;
148         int termPosition;
149     }
150 
151     /***
152      * Resets parsing in preparation for the next document.
153      */
154     public void reset() {
155         ignoreUntil = null;
156         text = null;
157         position = 0;
158         lastSplit = -1;
159 
160         tokens.clear();
161         openTags.clear();
162         closedTags.clear();
163 
164         if (tokenPositions != null) {
165             tokenPositions.clear();
166         }
167     }
168 
169     private void skipComment() {
170         if (text.substring(position).startsWith("<!--")) {
171             position = text.indexOf("-->", position + 1);
172 
173             if (position >= 0) {
174                 position += 2;
175             }
176         } else {
177             position = text.indexOf(">", position + 1);
178         }
179 
180         if (position < 0) {
181             position = text.length();
182         }
183     }
184 
185     private void skipProcessingInstruction() {
186         position = text.indexOf("?>", position + 1);
187 
188         if (position < 0) {
189             position = text.length();
190         }
191     }
192 
193     private void parseEndTag() {
194         // 1. read name (skipping the </ part)
195         int i;
196 
197         for (i = position + 2; i < text.length(); i++) {
198             char c = text.charAt(i);
199             if (Character.isSpaceChar(c) || c == '>') {
200                 break;
201             }
202         }
203 
204         String tagName = text.substring(position + 2, i).toLowerCase();
205 
206         if (ignoreUntil != null && ignoreUntil.equals(tagName)) {
207             ignoreUntil = null;
208         }
209         if (ignoreUntil == null) {
210             closeTag(tagName);        // advance to end '>'
211         }
212         while (i < text.length() && text.charAt(i) != '>') {
213             i++;
214         }
215         position = i;
216     }
217 
218     private void closeTag(final String tagName) {
219         if (!openTags.containsKey(tagName)) {
220             return;
221         }
222         ArrayList<BeginTag> tagList = openTags.get(tagName);
223 
224         if (tagList.size() > 0) {
225             int last = tagList.size() - 1;
226 
227             BeginTag openTag = tagList.get(last);
228             ClosedTag closedTag = new ClosedTag(openTag);
229             closedTags.add(closedTag);
230 
231             tagList.remove(last);
232         }
233     }
234 
235     private int indexOfNonSpace(int start) {
236         if (start < 0) {
237             return Integer.MIN_VALUE;
238         }
239         for (int i = start; i < text.length(); i++) {
240             char c = text.charAt(i);
241             if (!Character.isSpaceChar(c)) {
242                 return i;
243             }
244         }
245 
246         return Integer.MIN_VALUE;
247     }
248 
249     private int indexOfEndAttribute(int start, int tagEnd) {
250         if (start < 0) {
251             return Integer.MIN_VALUE;        // attribute ends at the first non-quoted space, or
252         // the first '>'.
253         }
254         boolean inQuote = false;
255         boolean lastEscape = false;
256 
257         for (int i = start; i <= tagEnd; i++) {
258             char c = text.charAt(i);
259 
260             if ((c == '\"' || c == '\'') && !lastEscape) {
261                 inQuote = !inQuote;
262                 if (!inQuote) {
263                     return i;
264                 }
265             } else if (!inQuote && (Character.isSpaceChar(c) || c == '>')) {
266                 return i;
267             } else if (c == '//' && !lastEscape) {
268                 lastEscape = true;
269             } else {
270                 lastEscape = false;
271             }
272         }
273 
274         return Integer.MIN_VALUE;
275     }
276 
277     private int indexOfSpace(int start) {
278         if (start < 0) {
279             return Integer.MIN_VALUE;
280         }
281         for (int i = start; i < text.length(); i++) {
282             char c = text.charAt(i);
283             if (Character.isSpaceChar(c)) {
284                 return i;
285             }
286         }
287 
288         return Integer.MIN_VALUE;
289     }
290 
291     private int indexOfEquals(int start, int end) {
292         if (start < 0) {
293             return Integer.MIN_VALUE;
294         }
295         for (int i = start; i < end; i++) {
296             char c = text.charAt(i);
297             if (c == '=') {
298                 return i;
299             }
300         }
301 
302         return Integer.MIN_VALUE;
303     }
304 
305     private void parseBeginTag() {
306         // 1. read the name, skipping the '<'
307         int i;
308 
309         for (i = position + 1; i < text.length(); i++) {
310             char c = text.charAt(i);
311             if (Character.isSpaceChar(c) || c == '>') {
312                 break;
313             }
314         }
315 
316         String tagName = text.substring(position + 1, i).toLowerCase();
317 
318         // 2. read attr pairs
319         i = indexOfNonSpace(i);
320         int tagEnd = text.indexOf(">", i + 1);
321         boolean closeIt = false;
322 
323         HashMap<String, String> attributes = new HashMap<String, String>();
324         while (i < tagEnd && i >= 0 && tagEnd >= 0) {
325             // scan ahead for non space
326             int start = indexOfNonSpace(i);
327 
328             if (start > 0) {
329                 if (text.charAt(start) == '>') {
330                     i = start;
331                     break;
332                 } else if (text.charAt(start) == '/' &&
333                         text.length() > start + 1 &&
334                         text.charAt(start + 1) == '>') {
335                     i = start + 1;
336                     closeIt = true;
337                     break;
338                 }
339             }
340 
341             int end = indexOfEndAttribute(start, tagEnd);
342             int equals = indexOfEquals(start, end);
343 
344             // try to find an equals sign
345             if (equals < 0 || equals == start || end == equals) {
346                 // if there's no equals, try to move to the next thing
347                 if (end < 0) {
348                     i = tagEnd;
349                     break;
350                 } else {
351                     i = end;
352                     continue;
353                 }
354             }
355 
356             // there is an equals, so try to parse the value
357             int startKey = start;
358             int endKey = equals;
359 
360             int startValue = equals + 1;
361             int endValue = end;
362 
363             if (text.charAt(startValue) == '\"' || text.charAt(startValue) == '\'') {
364                 startValue++;
365             }
366             if (startValue >= endValue || startKey >= endKey) {
367                 i = end;
368                 continue;
369             }
370 
371             String key = text.substring(startKey, endKey);
372             String value = text.substring(startValue, endValue);
373 
374             attributes.put(key.toLowerCase(), value);
375 
376             if (end >= text.length()) {
377                 endParsing();
378                 break;
379             }
380 
381             if (text.charAt(end) == '\"' || text.charAt(end) == '\'') {
382                 end++;
383             }
384 
385             i = end;
386         }
387 
388         if (!ignoredTags.contains(tagName)) {
389             BeginTag tag = new BeginTag(tagName, attributes);
390 
391             if (!openTags.containsKey(tagName)) {
392                 ArrayList tagList = new ArrayList();
393                 tagList.add(tag);
394                 openTags.put(tagName, tagList);
395             } else {
396                 openTags.get(tagName).add(tag);
397             }
398 
399             if (closeIt) {
400                 closeTag(tagName);
401             }
402         } else if (!closeIt) {
403             ignoreUntil = tagName;
404         }
405 
406         position = i;
407     }
408 
409     private void endParsing() {
410         position = text.length();
411     }
412 
413     private void onSplit() {
414         if (position - lastSplit > 1) {
415             int start = lastSplit + 1;
416             String token = text.substring(start, position);
417             StringStatus status = checkTokenStatus(token);
418 
419             switch (status) {
420                 case NeedsSimpleFix:
421                     token = tokenSimpleFix(token);
422                     break;
423 
424                 case NeedsComplexFix:
425                     token = tokenComplexFix(token);
426                     break;
427 
428                 case NeedsAcronymProcessing:
429                     tokenAcronymProcessing(token, start, position);
430                     break;
431 
432                 case Clean:
433                     // do nothing
434                     break;
435             }
436 
437             if (status != StringStatus.NeedsAcronymProcessing) {
438                 addToken(token, start, position);
439             }
440         }
441 
442         lastSplit = position;
443     }
444 
445     /***
446      * Adds a token to the document object.  This method currently drops tokens 
447      * longer than 100 bytes long right now.
448      * 
449      * @param token  The token to add.
450      * @param start  The starting byte offset of the token in the document text.
451      * @param end    The ending byte offset of the token in the document text.
452      */
453     private void addToken(final String token, int start, int end) {
454         final int maxTokenLength = 100;
455         // zero length tokens aren't interesting
456         if (token.length() <= 0) {
457             return;
458         }
459         // we want to make sure the token is short enough that someone
460         // might actually type it.  UTF-8 can expand one character to 6 bytes.
461         if (token.length() > maxTokenLength / 6 &&
462             Utility.makeBytes(token).length >= maxTokenLength) {
463             return;
464         }
465         tokens.add(token);
466         tokenPositions.add(new Pair(start, end));
467     }
468 
469     private String tokenComplexFix(String token) {
470         token = tokenSimpleFix(token);
471         token = token.toLowerCase();
472 
473         return token;
474     }
475 
476     /***
477      * This method does three kinds of processing:
478      * <ul>
479      *  <li>If the token contains periods at the beginning or the end,
480      *      they are removed.</li>
481      *  <li>If the token contains single letters followed by periods, such
482      *      as I.B.M., C.I.A., or U.S.A., the periods are removed.</li>
483      *  <li>If, instead, the token contains longer strings of text with
484      *      periods in the middle, the token is split into 
485      *      smaller tokens ("umass.edu" becomes {"umass", "edu"}).  Notice
486      *      that this means ("ph.d." becomes {"ph", "d"}).</li>
487      * </ul>
488      * 
489      * @param token
490      * @param start
491      * @param end
492      */
493     private void tokenAcronymProcessing(String token, int start, int end) {
494         token = tokenComplexFix(token);
495 
496         // remove start and ending periods
497         while (token.startsWith(".")) {
498             token = token.substring(1);
499             start = start + 1;
500         }
501 
502         while (token.endsWith(".")) {
503             token = token.substring(0, token.length() - 1);
504             end -= 1;
505         }
506 
507         // does the token have any periods left?
508         if (token.indexOf('.') >= 0) {
509             // is this an acronym?  then there will be periods
510             // at odd positions:
511             boolean isAcronym = token.length() > 0;
512             for (int pos = 1; pos < token.length(); pos += 2) {
513                 if (token.charAt(pos) != '.') {
514                     isAcronym = false;
515                 }
516             }
517 
518             if (isAcronym) {
519                 token = token.replace(".", "");
520                 addToken(token, start, end);
521             } else {
522                 int s = 0;
523                 for (int e = 0; e < token.length(); e++) {
524                     if (token.charAt(e) == '.') {
525                         if (e - s > 1) {
526                             String subtoken = token.substring(s, e);
527                             addToken(subtoken, start + s, start + e);
528                         }
529                         s = e + 1;
530                     }
531                 }
532 
533                 if (token.length() - s > 1) {
534                     String subtoken = token.substring(s);
535                     addToken(subtoken, start + s, end);
536                 }
537             }
538         } else {
539             addToken(token, start, end);
540         }
541     }
542 
543     /***
544      * Scans through the token, removing apostrophes and converting
545      * uppercase to lowercase letters.
546      * 
547      * @param token
548      * @return
549      */
550     private String tokenSimpleFix(String token) {
551         char[] chars = token.toCharArray();
552         int j = 0;
553 
554         for (int i = 0; i < chars.length; i++) {
555             char c = chars[i];
556             boolean isAsciiUppercase = (c >= 'A' && c <= 'Z');
557             boolean isApostrophe = (c == '\'');
558 
559             if (isAsciiUppercase) {
560                 chars[j] = (char) (chars[i] + 'a' - 'A');
561             } else if (isApostrophe) {
562                 // it's an apostrophe, skip it
563                 j--;
564             } else {
565                 chars[j] = chars[i];
566             }
567 
568             j++;
569         }
570 
571         token = new String(chars, 0, j);
572         return token;
573     }
574 
575     /***
576      * This method scans the token, looking for uppercase characters and
577      * special characters.  If the token contains only numbers and lowercase
578      * letters, it needs no further processing, and it returns Clean.
579      * If it also contains uppercase letters or apostrophes, it returns 
580      * NeedsSimpleFix.  If it contains special characters (especially Unicode
581      * characters), it returns NeedsComplexFix.  Finally, if any periods are
582      * present, this returns NeedsAcronymProcessing.
583      * 
584      * @param token
585      * @return
586      */
587     private StringStatus checkTokenStatus(final String token) {
588         StringStatus status = StringStatus.Clean;
589         char[] chars = token.toCharArray();
590 
591         for (int i = 0; i < chars.length; i++) {
592             char c = chars[i];
593             boolean isAsciiLowercase = (c >= 'a' && c <= 'z');
594             boolean isAsciiNumber = (c >= '0' && c <= '9');
595 
596             if (isAsciiLowercase || isAsciiNumber) {
597                 continue;
598             }
599             boolean isAsciiUppercase = (c >= 'A' && c <= 'Z');
600             boolean isPeriod = (c == '.');
601             boolean isApostrophe = (c == '\'');
602 
603             if ((isAsciiUppercase || isApostrophe) && status == StringStatus.Clean) {
604                 status = StringStatus.NeedsSimpleFix;
605             } else if (!isPeriod) {
606                 status = StringStatus.NeedsComplexFix;
607             } else {
608                 status = StringStatus.NeedsAcronymProcessing;
609                 break;
610             }
611         }
612 
613         return status;
614     }
615 
616     private void onStartBracket() {
617         if (position + 1 < text.length()) {
618             char c = text.charAt(position + 1);
619 
620             if (c == '/') {
621                 parseEndTag();
622             } else if (c == '!') {
623                 skipComment();
624             } else if (c == '?') {
625                 skipProcessingInstruction();
626             } else {
627                 parseBeginTag();
628             }
629         } else {
630             endParsing();
631         }
632 
633         lastSplit = position;
634     }
635 
636     /***
637      * Translates tags from the internal ClosedTag format to the
638      * Tag type.
639      */
640     private ArrayList<Tag> coalesceTags() {
641         ArrayList<Tag> result = new ArrayList();
642 
643         // close all open tags
644         for (ArrayList<BeginTag> tagList : openTags.values()) {
645             for (BeginTag tag : tagList) {
646                 result.add(new Tag(tag.name, tag.attributes, tag.termPosition, tag.termPosition));
647             }
648         }
649 
650         for (ClosedTag tag : closedTags) {
651             result.add(new Tag(tag.name, tag.attributes, tag.termStart, tag.termEnd));
652         }
653 
654         Collections.sort(result);
655         return result;
656     }
657 
658     public void onAmpersand() {
659         onSplit();
660 
661         for (int i = position + 1; i < text.length(); i++) {
662             char c = text.charAt(i);
663 
664             if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9' || c == '#') {
665                 continue;
666             }
667             if (c == ';') {
668                 position = i;
669                 lastSplit = i;
670                 return;
671             }
672 
673             // not a valid escape sequence
674             break;
675         }
676     }
677 
678     /***
679      * Parses the text in the document.text attribute and fills in the 
680      * document.terms and document.tags arrays, then passes that document
681      * to the next processing stage.
682      * 
683      * @param document
684      * @throws java.io.IOException
685      */
686     public void process(Document document) throws IOException {
687         tokenize(document);
688         processor.process(document);
689     }
690 
691     /***
692      * Parses the text in the document.text attribute and fills in the 
693      * document.terms and document.tags arrays.
694      * 
695      * @param document
696      * @throws java.io.IOException
697      */
698     public void tokenize(Document document) {
699         reset();
700         text = document.text;
701 
702         try {
703             // this loop is looking for tags, split characters, and XML escapes,
704             // which start with ampersands.  All other characters are assumed to
705             // be word characters.  The onSplit() method takes care of extracting
706             // word text and storing it in the terms array.  The onStartBracket
707             // method parses tags.  ignoreUntil is used to ignore comments and
708             // script data.
709             for (; position >= 0 && position < text.length(); position++) {
710                 char c = text.charAt(position);
711 
712                 if (c == '<') {
713                     if (ignoreUntil == null) {
714                         onSplit();
715                     }
716                     onStartBracket();
717                 } else if (ignoreUntil != null) {
718                     continue;
719                 } else if (c == '&') {
720                     onAmpersand();
721                 } else if (c < 256 && splits[c]) {
722                     onSplit();
723                 }
724             }
725         } catch (Exception e) {
726             Logger.getLogger(getClass().toString()).log(Level.WARNING,
727                                                         "Parse failure: " + document.identifier);
728         }
729 
730         if (ignoreUntil == null) {
731             onSplit();
732         }
733         document.terms = new ArrayList<String>(this.tokens);
734         document.tags = coalesceTags();
735         pooler.transform(document);
736     }
737 
738     /***
739      * Parses the text in the input string and returns a document object.
740      * This method calls the {#link tokenize(Document) other variant}.
741      * 
742      * @return A new document object containing the parsed text from the input string.
743      */
744     public Document tokenize(String text) throws IOException {
745         Document document = new Document();
746         document.text = text;
747         tokenize(document);
748 
749         return document;
750     }
751 
752     public ArrayList<Pair> getTokenPositions() {
753         return this.tokenPositions;
754     }
755 
756     public void setProcessor(final Step processor) throws IncompatibleProcessorException {
757         Linkage.link(this, processor);
758     }
759 
760     public void close() throws IOException {
761         processor.close();
762     }
763 
764     public Class<Document> getInputClass() {
765         return Document.class;
766     }
767 
768     public Class<Document> getOutputClass() {
769         return Document.class;
770     }
771 }