View Javadoc
1   package org.apache.maven.index.updater;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.BufferedInputStream;
23  import java.io.DataInput;
24  import java.io.DataInputStream;
25  import java.io.EOFException;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.io.UTFDataFormatException;
29  import java.util.Date;
30  import java.util.LinkedHashSet;
31  import java.util.Set;
32  import java.util.zip.GZIPInputStream;
33  
34  import org.apache.lucene.document.Document;
35  import org.apache.lucene.document.Field;
36  import org.apache.lucene.document.FieldType;
37  import org.apache.lucene.index.IndexOptions;
38  import org.apache.lucene.index.IndexWriter;
39  import org.apache.maven.index.ArtifactInfo;
40  import org.apache.maven.index.context.IndexUtils;
41  import org.apache.maven.index.context.IndexingContext;
42  
43  /**
44   * An index data reader used to parse transfer index format.
45   *
46   * @author Eugene Kuleshov
47   */
48  public class IndexDataReader
49  {
50      private final DataInputStream dis;
51  
52      public IndexDataReader( final InputStream is )
53          throws IOException
54      {
55          // MINDEXER-13
56          // LightweightHttpWagon may have performed automatic decompression
57          // Handle it transparently
58          is.mark( 2 );
59          InputStream data;
60          if ( is.read() == 0x1f && is.read() == 0x8b ) // GZIPInputStream.GZIP_MAGIC
61          {
62              is.reset();
63              data = new BufferedInputStream( new GZIPInputStream( is, 1024 * 8 ), 1024 * 8 );
64          }
65          else
66          {
67              is.reset();
68              data = new BufferedInputStream( is, 1024 * 8 );
69          }
70  
71          this.dis = new DataInputStream( data );
72      }
73  
74      public IndexDataReadResult readIndex( IndexWriter w, IndexingContext context )
75          throws IOException
76      {
77          long timestamp = readHeader();
78  
79          Date date = null;
80  
81          if ( timestamp != -1 )
82          {
83              date = new Date( timestamp );
84  
85              IndexUtils.updateTimestamp( w.getDirectory(), date );
86          }
87  
88          int n = 0;
89  
90          Document doc;
91          Set<String> rootGroups = new LinkedHashSet<>();
92          Set<String> allGroups = new LinkedHashSet<>();
93  
94          while ( ( doc = readDocument() ) != null )
95          {
96              ArtifactInfo ai = IndexUtils.constructArtifactInfo( doc, context );
97              if ( ai != null )
98              {
99                  w.addDocument( IndexUtils.updateDocument( doc, context, false, ai ) );
100 
101                 rootGroups.add( ai.getRootGroup() );
102                 allGroups.add( ai.getGroupId() );
103             }
104             else if ( doc.getField( ArtifactInfo.ALL_GROUPS ) != null
105                     || doc.getField( ArtifactInfo.ROOT_GROUPS ) != null )
106             {
107                 // skip it
108             }
109             else
110             {
111                 w.addDocument( doc );
112             }
113             n++;
114         }
115 
116         w.commit();
117 
118         IndexDataReadResult result = new IndexDataReadResult();
119         result.setDocumentCount( n );
120         result.setTimestamp( date );
121         result.setRootGroups( rootGroups );
122         result.setAllGroups( allGroups );
123 
124         return result;
125     }
126 
127     public long readHeader()
128         throws IOException
129     {
130         final byte hdrbyte = (byte) ( ( IndexDataWriter.VERSION << 24 ) >> 24 );
131 
132         if ( hdrbyte != dis.readByte() )
133         {
134             // data format version mismatch
135             throw new IOException( "Provided input contains unexpected data (0x01 expected as 1st byte)!" );
136         }
137 
138         return dis.readLong();
139     }
140 
141     public Document readDocument()
142         throws IOException
143     {
144         int fieldCount;
145         try
146         {
147             fieldCount = dis.readInt();
148         }
149         catch ( EOFException ex )
150         {
151             return null; // no more documents
152         }
153 
154         Document doc = new Document();
155 
156         for ( int i = 0; i < fieldCount; i++ )
157         {
158             doc.add( readField() );
159         }
160 
161         // Fix up UINFO field wrt MINDEXER-41
162         final Field uinfoField = (Field) doc.getField( ArtifactInfo.UINFO );
163         final String info =  doc.get( ArtifactInfo.INFO );
164         if ( uinfoField != null && info != null && !info.isEmpty() )
165         {
166             final String[] splitInfo = ArtifactInfo.FS_PATTERN.split( info );
167             if ( splitInfo.length > 6 )
168             {
169                 final String extension = splitInfo[6];
170                 final String uinfoString = uinfoField.stringValue();
171                 if ( uinfoString.endsWith( ArtifactInfo.FS + ArtifactInfo.NA ) )
172                 {
173                     uinfoField.setStringValue( uinfoString + ArtifactInfo.FS + ArtifactInfo.nvl( extension ) );
174                 }
175             }
176         }
177 
178         return doc;
179     }
180 
181     private Field readField()
182         throws IOException
183     {
184         int flags = dis.read();
185 
186         FieldType fieldType = new FieldType();
187         if ( ( flags & IndexDataWriter.F_INDEXED ) > 0 )
188         {
189             boolean tokenized = ( flags & IndexDataWriter.F_TOKENIZED ) > 0;
190             fieldType.setTokenized( tokenized );
191         }
192         fieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
193         fieldType.setStored( ( flags & IndexDataWriter.F_STORED ) > 0 );
194 
195         String name = dis.readUTF();
196         String value = readUTF( dis );
197 
198         return new Field( name, value, fieldType );
199     }
200 
201     private static String readUTF( DataInput in )
202         throws IOException
203     {
204         int utflen = in.readInt();
205 
206         byte[] bytearr;
207         char[] chararr;
208 
209         try
210         {
211             bytearr = new byte[utflen];
212             chararr = new char[utflen];
213         }
214         catch ( OutOfMemoryError e )
215         {
216             throw new IOException( "Index data content is inappropriate (is junk?), leads to OutOfMemoryError!"
217                 + " See MINDEXER-28 for more information!", e );
218         }
219 
220         int c, char2, char3;
221         int count = 0;
222         int chararrCount = 0;
223 
224         in.readFully( bytearr, 0, utflen );
225 
226         while ( count < utflen )
227         {
228             c = bytearr[count] & 0xff;
229             if ( c > 127 )
230             {
231                 break;
232             }
233             count++;
234             chararr[chararrCount++] = (char) c;
235         }
236 
237         while ( count < utflen )
238         {
239             c = bytearr[count] & 0xff;
240             switch ( c >> 4 )
241             {
242                 case 0:
243                 case 1:
244                 case 2:
245                 case 3:
246                 case 4:
247                 case 5:
248                 case 6:
249                 case 7:
250                     /* 0xxxxxxx */
251                     count++;
252                     chararr[chararrCount++] = (char) c;
253                     break;
254 
255                 case 12:
256                 case 13:
257                     /* 110x xxxx 10xx xxxx */
258                     count += 2;
259                     if ( count > utflen )
260                     {
261                         throw new UTFDataFormatException( "malformed input: partial character at end" );
262                     }
263                     char2 = bytearr[count - 1];
264                     if ( ( char2 & 0xC0 ) != 0x80 )
265                     {
266                         throw new UTFDataFormatException( "malformed input around byte " + count );
267                     }
268                     chararr[chararrCount++] = (char) ( ( ( c & 0x1F ) << 6 ) | ( char2 & 0x3F ) );
269                     break;
270 
271                 case 14:
272                     /* 1110 xxxx 10xx xxxx 10xx xxxx */
273                     count += 3;
274                     if ( count > utflen )
275                     {
276                         throw new UTFDataFormatException( "malformed input: partial character at end" );
277                     }
278                     char2 = bytearr[count - 2];
279                     char3 = bytearr[count - 1];
280                     if ( ( ( char2 & 0xC0 ) != 0x80 ) || ( ( char3 & 0xC0 ) != 0x80 ) )
281                     {
282                         throw new UTFDataFormatException( "malformed input around byte " + ( count - 1 ) );
283                     }
284                     chararr[chararrCount++] =
285                         (char) ( ( ( c & 0x0F ) << 12 ) | ( ( char2 & 0x3F ) << 6 ) | ( ( char3 & 0x3F ) ) );
286                     break;
287 
288                 default:
289                     /* 10xx xxxx, 1111 xxxx */
290                     throw new UTFDataFormatException( "malformed input around byte " + count );
291             }
292         }
293 
294         // The number of chars produced may be less than utflen
295         return new String( chararr, 0, chararrCount );
296     }
297 
298     /**
299      * An index data read result holder
300      */
301     public static class IndexDataReadResult
302     {
303         private Date timestamp;
304 
305         private int documentCount;
306 
307         private Set<String> rootGroups;
308 
309         private Set<String> allGroups;
310 
311         public void setDocumentCount( int documentCount )
312         {
313             this.documentCount = documentCount;
314         }
315 
316         public int getDocumentCount()
317         {
318             return documentCount;
319         }
320 
321         public void setTimestamp( Date timestamp )
322         {
323             this.timestamp = timestamp;
324         }
325 
326         public Date getTimestamp()
327         {
328             return timestamp;
329         }
330 
331         public void setRootGroups( Set<String> rootGroups )
332         {
333             this.rootGroups = rootGroups;
334         }
335 
336         public Set<String> getRootGroups()
337         {
338             return rootGroups;
339         }
340 
341         public void setAllGroups( Set<String> allGroups )
342         {
343             this.allGroups = allGroups;
344         }
345 
346         public Set<String> getAllGroups()
347         {
348             return allGroups;
349         }
350 
351     }
352 
353     /**
354      * Reads index content by using a visitor. <br>
355      * The visitor is called for each read documents after it has been populated with Lucene fields.
356      *
357      * @param visitor an index data visitor
358      * @param context indexing context
359      * @return statistics about read data
360      * @throws IOException in case of an IO exception during index file access
361      */
362     public IndexDataReadResult readIndex( final IndexDataReadVisitor visitor, final IndexingContext context )
363         throws IOException
364     {
365         dis.readByte(); // data format version
366 
367         long timestamp = dis.readLong();
368 
369         Date date = null;
370 
371         if ( timestamp != -1 )
372         {
373             date = new Date( timestamp );
374         }
375 
376         int n = 0;
377 
378         Document doc;
379         while ( ( doc = readDocument() ) != null )
380         {
381             visitor.visitDocument( IndexUtils.updateDocument( doc, context, false ) );
382 
383             n++;
384         }
385 
386         IndexDataReadResult result = new IndexDataReadResult();
387         result.setDocumentCount( n );
388         result.setTimestamp( date );
389         return result;
390     }
391 
392     /**
393      * Visitor of indexed Lucene documents.
394      */
395     public interface IndexDataReadVisitor
396     {
397 
398         /**
399          * Called on each read document. The document is already populated with fields.
400          *
401          * @param document read document
402          */
403         void visitDocument( Document document );
404 
405     }
406 
407 }