View Javadoc

1   package org.apache.maven.index.updater;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0    
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.BufferedInputStream;
23  import java.io.DataInput;
24  import java.io.DataInputStream;
25  import java.io.EOFException;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.io.UTFDataFormatException;
29  import java.util.Date;
30  import java.util.zip.GZIPInputStream;
31  
32  import org.apache.lucene.document.Document;
33  import org.apache.lucene.document.Field;
34  import org.apache.lucene.document.Field.Index;
35  import org.apache.lucene.document.Field.Store;
36  import org.apache.lucene.index.IndexWriter;
37  import org.apache.maven.index.context.IndexUtils;
38  import org.apache.maven.index.context.IndexingContext;
39  
40  /**
41   * An index data reader used to parse transfer index format.
42   * 
43   * @author Eugene Kuleshov
44   */
45  public class IndexDataReader
46  {
47      private final DataInputStream dis;
48  
49      public IndexDataReader( InputStream is )
50          throws IOException
51      {
52          BufferedInputStream bis = new BufferedInputStream( is, 1024 * 8 );
53  
54          // MINDEXER-13
55          // LightweightHttpWagon may have performed automatic decompression
56          // Handle it transparently
57          bis.mark( 2 );
58          InputStream data;
59          if ( bis.read() == 0x1f && bis.read() == 0x8b ) // GZIPInputStream.GZIP_MAGIC
60          {
61              bis.reset();
62              data = new GZIPInputStream( bis, 2 * 1024 );
63          }
64          else
65          {
66              bis.reset();
67              data = bis;
68          }
69  
70          this.dis = new DataInputStream( data );
71      }
72  
73      public IndexDataReadResult readIndex( IndexWriter w, IndexingContext context )
74          throws IOException
75      {
76          long timestamp = readHeader();
77  
78          Date date = null;
79  
80          if ( timestamp != -1 )
81          {
82              date = new Date( timestamp );
83  
84              IndexUtils.updateTimestamp( w.getDirectory(), date );
85          }
86  
87          int n = 0;
88  
89          Document doc;
90          while ( ( doc = readDocument() ) != null )
91          {
92              w.addDocument( IndexUtils.updateDocument( doc, context, false ) );
93  
94              n++;
95          }
96  
97          w.commit();
98          w.optimize();
99  
100         IndexDataReadResult result = new IndexDataReadResult();
101         result.setDocumentCount( n );
102         result.setTimestamp( date );
103         return result;
104     }
105 
106     public long readHeader()
107         throws IOException
108     {
109         final byte HDRBYTE = (byte) ( ( IndexDataWriter.VERSION << 24 ) >> 24 );
110 
111         if ( HDRBYTE != dis.readByte() )
112         {
113             // data format version mismatch
114             throw new IOException( "Provided input contains unexpected data (0x01 expected as 1st byte)!" );
115         }
116 
117         return dis.readLong();
118     }
119 
120     public Document readDocument()
121         throws IOException
122     {
123         int fieldCount;
124         try
125         {
126             fieldCount = dis.readInt();
127         }
128         catch ( EOFException ex )
129         {
130             return null; // no more documents
131         }
132 
133         Document doc = new Document();
134 
135         for ( int i = 0; i < fieldCount; i++ )
136         {
137             doc.add( readField() );
138         }
139 
140         return doc;
141     }
142 
143     private Field readField()
144         throws IOException
145     {
146         int flags = dis.read();
147 
148         Index index = Index.NO;
149         if ( ( flags & IndexDataWriter.F_INDEXED ) > 0 )
150         {
151             boolean isTokenized = ( flags & IndexDataWriter.F_TOKENIZED ) > 0;
152             index = isTokenized ? Index.ANALYZED : Index.NOT_ANALYZED;
153         }
154 
155         Store store = Store.NO;
156         if ( ( flags & IndexDataWriter.F_STORED ) > 0 )
157         {
158             store = Store.YES;
159         }
160 
161         String name = dis.readUTF();
162         String value = readUTF( dis );
163 
164         return new Field( name, value, store, index );
165     }
166 
167     private static String readUTF( DataInput in )
168         throws IOException
169     {
170         int utflen = in.readInt();
171 
172         byte[] bytearr;
173         char[] chararr;
174 
175         try
176         {
177             bytearr = new byte[utflen];
178             chararr = new char[utflen];
179         }
180         catch ( OutOfMemoryError e )
181         {
182             final IOException ex =
183                 new IOException(
184                     "Index data content is inappropriate (is junk?), leads to OutOfMemoryError! See MINDEXER-28 for more information!" );
185             ex.initCause( e );
186             throw ex;
187         }
188 
189         int c, char2, char3;
190         int count = 0;
191         int chararr_count = 0;
192 
193         in.readFully( bytearr, 0, utflen );
194 
195         while ( count < utflen )
196         {
197             c = bytearr[count] & 0xff;
198             if ( c > 127 )
199             {
200                 break;
201             }
202             count++;
203             chararr[chararr_count++] = (char) c;
204         }
205 
206         while ( count < utflen )
207         {
208             c = bytearr[count] & 0xff;
209             switch ( c >> 4 )
210             {
211                 case 0:
212                 case 1:
213                 case 2:
214                 case 3:
215                 case 4:
216                 case 5:
217                 case 6:
218                 case 7:
219                     /* 0xxxxxxx */
220                     count++;
221                     chararr[chararr_count++] = (char) c;
222                     break;
223 
224                 case 12:
225                 case 13:
226                     /* 110x xxxx 10xx xxxx */
227                     count += 2;
228                     if ( count > utflen )
229                     {
230                         throw new UTFDataFormatException( "malformed input: partial character at end" );
231                     }
232                     char2 = bytearr[count - 1];
233                     if ( ( char2 & 0xC0 ) != 0x80 )
234                     {
235                         throw new UTFDataFormatException( "malformed input around byte " + count );
236                     }
237                     chararr[chararr_count++] = (char) ( ( ( c & 0x1F ) << 6 ) | ( char2 & 0x3F ) );
238                     break;
239 
240                 case 14:
241                     /* 1110 xxxx 10xx xxxx 10xx xxxx */
242                     count += 3;
243                     if ( count > utflen )
244                     {
245                         throw new UTFDataFormatException( "malformed input: partial character at end" );
246                     }
247                     char2 = bytearr[count - 2];
248                     char3 = bytearr[count - 1];
249                     if ( ( ( char2 & 0xC0 ) != 0x80 ) || ( ( char3 & 0xC0 ) != 0x80 ) )
250                     {
251                         throw new UTFDataFormatException( "malformed input around byte " + ( count - 1 ) );
252                     }
253                     chararr[chararr_count++] =
254                         (char) ( ( ( c & 0x0F ) << 12 ) | ( ( char2 & 0x3F ) << 6 ) | ( ( char3 & 0x3F ) << 0 ) );
255                     break;
256 
257                 default:
258                     /* 10xx xxxx, 1111 xxxx */
259                     throw new UTFDataFormatException( "malformed input around byte " + count );
260             }
261         }
262 
263         // The number of chars produced may be less than utflen
264         return new String( chararr, 0, chararr_count );
265     }
266 
267     /**
268      * An index data read result holder
269      */
270     public static class IndexDataReadResult
271     {
272         private Date timestamp;
273 
274         private int documentCount;
275 
276         public void setDocumentCount( int documentCount )
277         {
278             this.documentCount = documentCount;
279         }
280 
281         public int getDocumentCount()
282         {
283             return documentCount;
284         }
285 
286         public void setTimestamp( Date timestamp )
287         {
288             this.timestamp = timestamp;
289         }
290 
291         public Date getTimestamp()
292         {
293             return timestamp;
294         }
295 
296     }
297 
298     /**
299      * Reads index content by using a visitor. <br>
300      * The visitor is called for each read documents after it has been populated with Lucene fields.
301      * 
302      * @param visitor an index data visitor
303      * @param context indexing context
304      * @return statistics about read data
305      * @throws IOException in case of an IO exception during index file access
306      */
307     public IndexDataReadResult readIndex( final IndexDataReadVisitor visitor, final IndexingContext context )
308         throws IOException
309     {
310         dis.readByte(); // data format version
311 
312         long timestamp = dis.readLong();
313 
314         Date date = null;
315 
316         if ( timestamp != -1 )
317         {
318             date = new Date( timestamp );
319         }
320 
321         int n = 0;
322 
323         Document doc;
324         while ( ( doc = readDocument() ) != null )
325         {
326             visitor.visitDocument( IndexUtils.updateDocument( doc, context, false ) );
327 
328             n++;
329         }
330 
331         IndexDataReadResult result = new IndexDataReadResult();
332         result.setDocumentCount( n );
333         result.setTimestamp( date );
334         return result;
335     }
336 
337     /**
338      * Visitor of indexed Lucene documents.
339      */
340     public static interface IndexDataReadVisitor
341     {
342 
343         /**
344          * Called on each read document. The document is already populated with fields.
345          * 
346          * @param document read document
347          */
348         void visitDocument( Document document );
349 
350     }
351 
352 }