View Javadoc

1   // BSD License (http://www.galagosearch.org/license)
2   package org.galagosearch.core.index;
3   
4   import java.io.DataOutputStream;
5   import java.io.FileNotFoundException;
6   import java.io.IOException;
7   import java.io.OutputStream;
8   import java.util.TreeMap;
9   import org.galagosearch.core.types.NumberWordPosition;
10  import org.galagosearch.tupleflow.InputClass;
11  import org.galagosearch.tupleflow.TupleFlowParameters;
12  import org.galagosearch.tupleflow.Utility;
13  import org.galagosearch.tupleflow.execution.ErrorHandler;
14  import org.galagosearch.tupleflow.execution.Verification;
15  
16  /***
17   *
18   * @author trevor
19   */
20  @InputClass(className = "org.galagosearch.core.types.NumberWordPosition", order = {"+word", "+document", "+position"})
21  public class PositionIndexWriter implements
22          NumberWordPosition.WordDocumentPositionOrder.ShreddedProcessor {
23      int blockSize = 32768;
24      byte[] lastWord;
25      long lastPosition = 0;
26      long lastDocument = 0;
27      int skipMinimumBinLength;
28      TreeMap<Integer, Integer> skipLengths;
29  
30      public class PositionsList implements IndexElement {
31          public PositionsList() {
32              documents = new BackedCompressedByteBuffer();
33              counts = new BackedCompressedByteBuffer();
34              positions = new BackedCompressedByteBuffer();
35              header = new BackedCompressedByteBuffer();
36          }
37  
38          public void close() throws IOException {
39              int options = 0;
40  
41              if (documents.length() > 0) {
42                  counts.add(positionCount);
43              }
44              header.add(options);
45  
46              header.add(documentCount);
47              header.add(totalPositionCount);
48  
49              header.add(documents.length());
50              header.add(counts.length());
51              header.add(positions.length());
52          }
53  
54          public long dataLength() {
55              long listLength = 0;
56  
57              listLength += header.length();
58              listLength += counts.length();
59              listLength += positions.length();
60              listLength += documents.length();
61  
62              return listLength;
63          }
64  
65          public void write(final OutputStream output) throws IOException {
66              header.write(output);
67              header.clear();
68  
69              documents.write(output);
70              documents.clear();
71  
72              counts.write(output);
73              counts.clear();
74  
75              positions.write(output);
76              positions.clear();
77          }
78  
79          public byte[] key() {
80              return word;
81          }
82  
83          public void setWord(byte[] word) {
84              this.word = word;
85              this.lastDocument = 0;
86              this.lastPosition = 0;
87              this.totalPositionCount = 0;
88              this.positionCount = 0;
89          }
90  
91          public void addDocument(long documentID) throws IOException {
92              // add the last document's counts
93              if (documents.length() > 0) {
94                  counts.add(positionCount);
95              }
96              documents.add(documentID - lastDocument);
97              lastDocument = documentID;
98  
99              lastPosition = 0;
100             positionCount = 0;
101             documentCount++;
102         }
103 
104         public void addPosition(int position) throws IOException {
105             positionCount++;
106             totalPositionCount++;
107             positions.add(position - lastPosition);
108             lastPosition = position;
109         }
110         private long lastDocument;
111         private int lastPosition;
112         private int positionCount;
113         private int documentCount;
114         private int totalPositionCount;
115         public byte[] word;
116         public BackedCompressedByteBuffer header;
117         public BackedCompressedByteBuffer documents;
118         public BackedCompressedByteBuffer counts;
119         public BackedCompressedByteBuffer positions;
120     }
121     long maximumDocumentCount = 0;
122     long maximumDocumentNumber = 0;
123     PositionsList invertedList;
124     DataOutputStream output;
125     long filePosition;
126     IndexWriter writer;
127     long documentCount = 0;
128     long collectionLength = 0;
129 
130     /***
131      * Creates a new instance of BinnedListWriter
132      */
133     public PositionIndexWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException {
134         writer = new IndexWriter(parameters);
135         writer.getManifest().add("writerClass", getClass().getName());
136         writer.getManifest().add("readerClass", PositionIndexReader.class.getName());
137     }
138 
139     public void processWord(byte[] wordBytes) throws IOException {
140         if (invertedList != null) {
141             invertedList.close();
142             writer.add(invertedList);
143             invertedList = null;
144         }
145 
146         resetDocumentCount();
147 
148         invertedList = new PositionsList();
149         invertedList.setWord(wordBytes);
150 
151         assert lastWord == null || 0 != Utility.compare(lastWord, wordBytes) : "Duplicate word";
152         lastWord = wordBytes;
153     }
154 
155     public void processDocument(int document) throws IOException {
156         invertedList.addDocument(document);
157         documentCount++;
158         maximumDocumentNumber = Math.max(document, maximumDocumentNumber);
159         lastDocument = document;
160     }
161 
162     public void processPosition(int position) throws IOException {
163         invertedList.addPosition(position);
164     }
165 
166     public void processTuple() {
167         // does nothing
168     }
169 
170     private void resetDocumentCount() {
171         maximumDocumentCount = Math.max(documentCount, maximumDocumentCount);
172         documentCount = 0;
173     }
174 
175     public void close() throws IOException {
176         if (invertedList != null) {
177             invertedList.close();
178             writer.add(invertedList);
179         }
180 
181         writer.close();
182     }
183 
184     public long documentCount() {
185         return maximumDocumentNumber;
186     }
187 
188     public long maximumDocumentCount() {
189         return maximumDocumentCount;
190     }
191 
192     public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
193         if (!parameters.getXML().containsKey("filename")) {
194             handler.addError("PositionsListWriter requires an 'filename' parameter.");
195             return;
196         }
197 
198         String index = parameters.getXML().get("filename");
199         Verification.requireWriteableFile(index, handler);
200     }
201 }
202