1
2 package org.galagosearch.core.index;
3
4 import java.io.DataOutputStream;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.io.OutputStream;
8 import java.util.TreeMap;
9 import org.galagosearch.core.types.NumberWordPosition;
10 import org.galagosearch.tupleflow.InputClass;
11 import org.galagosearch.tupleflow.TupleFlowParameters;
12 import org.galagosearch.tupleflow.Utility;
13 import org.galagosearch.tupleflow.execution.ErrorHandler;
14 import org.galagosearch.tupleflow.execution.Verification;
15
16 /***
17 *
18 * @author trevor
19 */
20 @InputClass(className = "org.galagosearch.core.types.NumberWordPosition", order = {"+word", "+document", "+position"})
21 public class PositionIndexWriter implements
22 NumberWordPosition.WordDocumentPositionOrder.ShreddedProcessor {
23 int blockSize = 32768;
24 byte[] lastWord;
25 long lastPosition = 0;
26 long lastDocument = 0;
27 int skipMinimumBinLength;
28 TreeMap<Integer, Integer> skipLengths;
29
30 public class PositionsList implements IndexElement {
31 public PositionsList() {
32 documents = new BackedCompressedByteBuffer();
33 counts = new BackedCompressedByteBuffer();
34 positions = new BackedCompressedByteBuffer();
35 header = new BackedCompressedByteBuffer();
36 }
37
38 public void close() throws IOException {
39 int options = 0;
40
41 if (documents.length() > 0) {
42 counts.add(positionCount);
43 }
44 header.add(options);
45
46 header.add(documentCount);
47 header.add(totalPositionCount);
48
49 header.add(documents.length());
50 header.add(counts.length());
51 header.add(positions.length());
52 }
53
54 public long dataLength() {
55 long listLength = 0;
56
57 listLength += header.length();
58 listLength += counts.length();
59 listLength += positions.length();
60 listLength += documents.length();
61
62 return listLength;
63 }
64
65 public void write(final OutputStream output) throws IOException {
66 header.write(output);
67 header.clear();
68
69 documents.write(output);
70 documents.clear();
71
72 counts.write(output);
73 counts.clear();
74
75 positions.write(output);
76 positions.clear();
77 }
78
79 public byte[] key() {
80 return word;
81 }
82
83 public void setWord(byte[] word) {
84 this.word = word;
85 this.lastDocument = 0;
86 this.lastPosition = 0;
87 this.totalPositionCount = 0;
88 this.positionCount = 0;
89 }
90
91 public void addDocument(long documentID) throws IOException {
92
93 if (documents.length() > 0) {
94 counts.add(positionCount);
95 }
96 documents.add(documentID - lastDocument);
97 lastDocument = documentID;
98
99 lastPosition = 0;
100 positionCount = 0;
101 documentCount++;
102 }
103
104 public void addPosition(int position) throws IOException {
105 positionCount++;
106 totalPositionCount++;
107 positions.add(position - lastPosition);
108 lastPosition = position;
109 }
110 private long lastDocument;
111 private int lastPosition;
112 private int positionCount;
113 private int documentCount;
114 private int totalPositionCount;
115 public byte[] word;
116 public BackedCompressedByteBuffer header;
117 public BackedCompressedByteBuffer documents;
118 public BackedCompressedByteBuffer counts;
119 public BackedCompressedByteBuffer positions;
120 }
121 long maximumDocumentCount = 0;
122 long maximumDocumentNumber = 0;
123 PositionsList invertedList;
124 DataOutputStream output;
125 long filePosition;
126 IndexWriter writer;
127 long documentCount = 0;
128 long collectionLength = 0;
129
130 /***
131 * Creates a new instance of BinnedListWriter
132 */
133 public PositionIndexWriter(TupleFlowParameters parameters) throws FileNotFoundException, IOException {
134 writer = new IndexWriter(parameters);
135 writer.getManifest().add("writerClass", getClass().getName());
136 writer.getManifest().add("readerClass", PositionIndexReader.class.getName());
137 }
138
139 public void processWord(byte[] wordBytes) throws IOException {
140 if (invertedList != null) {
141 invertedList.close();
142 writer.add(invertedList);
143 invertedList = null;
144 }
145
146 resetDocumentCount();
147
148 invertedList = new PositionsList();
149 invertedList.setWord(wordBytes);
150
151 assert lastWord == null || 0 != Utility.compare(lastWord, wordBytes) : "Duplicate word";
152 lastWord = wordBytes;
153 }
154
155 public void processDocument(int document) throws IOException {
156 invertedList.addDocument(document);
157 documentCount++;
158 maximumDocumentNumber = Math.max(document, maximumDocumentNumber);
159 lastDocument = document;
160 }
161
162 public void processPosition(int position) throws IOException {
163 invertedList.addPosition(position);
164 }
165
166 public void processTuple() {
167
168 }
169
170 private void resetDocumentCount() {
171 maximumDocumentCount = Math.max(documentCount, maximumDocumentCount);
172 documentCount = 0;
173 }
174
175 public void close() throws IOException {
176 if (invertedList != null) {
177 invertedList.close();
178 writer.add(invertedList);
179 }
180
181 writer.close();
182 }
183
184 public long documentCount() {
185 return maximumDocumentNumber;
186 }
187
188 public long maximumDocumentCount() {
189 return maximumDocumentCount;
190 }
191
192 public static void verify(TupleFlowParameters parameters, ErrorHandler handler) {
193 if (!parameters.getXML().containsKey("filename")) {
194 handler.addError("PositionsListWriter requires an 'filename' parameter.");
195 return;
196 }
197
198 String index = parameters.getXML().get("filename");
199 Verification.requireWriteableFile(index, handler);
200 }
201 }
202