View Javadoc
1   package org.apache.maven.index.updater;
2   
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
21  
22  import java.io.BufferedInputStream;
23  import java.io.DataInput;
24  import java.io.DataInputStream;
25  import java.io.EOFException;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.io.UTFDataFormatException;
29  import java.util.Date;
30  import java.util.LinkedHashSet;
31  import java.util.Set;
32  import java.util.zip.GZIPInputStream;
33  
34  import org.apache.lucene.document.Document;
35  import org.apache.lucene.document.Field;
36  import org.apache.lucene.document.FieldType;
37  import org.apache.lucene.index.IndexOptions;
38  import org.apache.lucene.index.IndexWriter;
39  import org.apache.maven.index.ArtifactInfo;
40  import org.apache.maven.index.context.IndexUtils;
41  import org.apache.maven.index.context.IndexingContext;
42  
43  /**
44   * An index data reader used to parse transfer index format.
45   *
46   * @author Eugene Kuleshov
47   */
48  public class IndexDataReader
49  {
50      private final DataInputStream dis;
51  
52      public IndexDataReader( final InputStream is )
53          throws IOException
54      {
55          // MINDEXER-13
56          // LightweightHttpWagon may have performed automatic decompression
57          // Handle it transparently
58          is.mark( 2 );
59          InputStream data;
60          if ( is.read() == 0x1f && is.read() == 0x8b ) // GZIPInputStream.GZIP_MAGIC
61          {
62              is.reset();
63              data = new BufferedInputStream( new GZIPInputStream( is, 1024 * 8 ), 1024 * 8 );
64          }
65          else
66          {
67              is.reset();
68              data = new BufferedInputStream( is, 1024 * 8 );
69          }
70  
71          this.dis = new DataInputStream( data );
72      }
73  
74      public IndexDataReadResult readIndex( IndexWriter w, IndexingContext context )
75          throws IOException
76      {
77          long timestamp = readHeader();
78  
79          Date date = null;
80  
81          if ( timestamp != -1 )
82          {
83              date = new Date( timestamp );
84  
85              IndexUtils.updateTimestamp( w.getDirectory(), date );
86          }
87  
88          int n = 0;
89  
90          Document doc;
91          Set<String> rootGroups = new LinkedHashSet<>();
92          Set<String> allGroups = new LinkedHashSet<>();
93  
94          while ( ( doc = readDocument() ) != null )
95          {
96              ArtifactInfo ai = IndexUtils.constructArtifactInfo( doc, context );
97              if ( ai != null )
98              {
99                  w.addDocument( IndexUtils.updateDocument( doc, context, false, ai ) );
100 
101                 rootGroups.add( ai.getRootGroup() );
102                 allGroups.add( ai.getGroupId() );
103 
104             }
105             else
106             {
107                 w.addDocument( doc );
108             }
109             n++;
110         }
111 
112         w.commit();
113 
114         IndexDataReadResult result = new IndexDataReadResult();
115         result.setDocumentCount( n );
116         result.setTimestamp( date );
117         result.setRootGroups( rootGroups );
118         result.setAllGroups( allGroups );
119 
120         return result;
121     }
122 
123     public long readHeader()
124         throws IOException
125     {
126         final byte hdrbyte = (byte) ( ( IndexDataWriter.VERSION << 24 ) >> 24 );
127 
128         if ( hdrbyte != dis.readByte() )
129         {
130             // data format version mismatch
131             throw new IOException( "Provided input contains unexpected data (0x01 expected as 1st byte)!" );
132         }
133 
134         return dis.readLong();
135     }
136 
137     public Document readDocument()
138         throws IOException
139     {
140         int fieldCount;
141         try
142         {
143             fieldCount = dis.readInt();
144         }
145         catch ( EOFException ex )
146         {
147             return null; // no more documents
148         }
149 
150         Document doc = new Document();
151 
152         for ( int i = 0; i < fieldCount; i++ )
153         {
154             doc.add( readField() );
155         }
156 
157         // Fix up UINFO field wrt MINDEXER-41
158         final Field uinfoField = (Field) doc.getField( ArtifactInfo.UINFO );
159         final String info =  doc.get( ArtifactInfo.INFO );
160         if ( uinfoField != null && info != null && !info.isEmpty() )
161         {
162             final String[] splitInfo = ArtifactInfo.FS_PATTERN.split( info );
163             if ( splitInfo.length > 6 )
164             {
165                 final String extension = splitInfo[6];
166                 final String uinfoString = uinfoField.stringValue();
167                 if ( uinfoString.endsWith( ArtifactInfo.FS + ArtifactInfo.NA ) )
168                 {
169                     uinfoField.setStringValue( uinfoString + ArtifactInfo.FS + ArtifactInfo.nvl( extension ) );
170                 }
171             }
172         }
173 
174         return doc;
175     }
176 
177     private Field readField()
178         throws IOException
179     {
180         int flags = dis.read();
181 
182         FieldType fieldType = new FieldType();
183         if ( ( flags & IndexDataWriter.F_INDEXED ) > 0 )
184         {
185             boolean tokenized = ( flags & IndexDataWriter.F_TOKENIZED ) > 0;
186             fieldType.setTokenized( tokenized );
187             fieldType.setOmitNorms( !tokenized );
188             fieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
189         }
190         fieldType.setStored( ( flags & IndexDataWriter.F_STORED ) > 0 );
191 
192         String name = dis.readUTF();
193         String value = readUTF( dis );
194 
195         return new Field( name, value, fieldType );
196     }
197 
198     private static String readUTF( DataInput in )
199         throws IOException
200     {
201         int utflen = in.readInt();
202 
203         byte[] bytearr;
204         char[] chararr;
205 
206         try
207         {
208             bytearr = new byte[utflen];
209             chararr = new char[utflen];
210         }
211         catch ( OutOfMemoryError e )
212         {
213             throw new IOException( "Index data content is inappropriate (is junk?), leads to OutOfMemoryError!"
214                 + " See MINDEXER-28 for more information!", e );
215         }
216 
217         int c, char2, char3;
218         int count = 0;
219         int chararrCount = 0;
220 
221         in.readFully( bytearr, 0, utflen );
222 
223         while ( count < utflen )
224         {
225             c = bytearr[count] & 0xff;
226             if ( c > 127 )
227             {
228                 break;
229             }
230             count++;
231             chararr[chararrCount++] = (char) c;
232         }
233 
234         while ( count < utflen )
235         {
236             c = bytearr[count] & 0xff;
237             switch ( c >> 4 )
238             {
239                 case 0:
240                 case 1:
241                 case 2:
242                 case 3:
243                 case 4:
244                 case 5:
245                 case 6:
246                 case 7:
247                     /* 0xxxxxxx */
248                     count++;
249                     chararr[chararrCount++] = (char) c;
250                     break;
251 
252                 case 12:
253                 case 13:
254                     /* 110x xxxx 10xx xxxx */
255                     count += 2;
256                     if ( count > utflen )
257                     {
258                         throw new UTFDataFormatException( "malformed input: partial character at end" );
259                     }
260                     char2 = bytearr[count - 1];
261                     if ( ( char2 & 0xC0 ) != 0x80 )
262                     {
263                         throw new UTFDataFormatException( "malformed input around byte " + count );
264                     }
265                     chararr[chararrCount++] = (char) ( ( ( c & 0x1F ) << 6 ) | ( char2 & 0x3F ) );
266                     break;
267 
268                 case 14:
269                     /* 1110 xxxx 10xx xxxx 10xx xxxx */
270                     count += 3;
271                     if ( count > utflen )
272                     {
273                         throw new UTFDataFormatException( "malformed input: partial character at end" );
274                     }
275                     char2 = bytearr[count - 2];
276                     char3 = bytearr[count - 1];
277                     if ( ( ( char2 & 0xC0 ) != 0x80 ) || ( ( char3 & 0xC0 ) != 0x80 ) )
278                     {
279                         throw new UTFDataFormatException( "malformed input around byte " + ( count - 1 ) );
280                     }
281                     chararr[chararrCount++] =
282                         (char) ( ( ( c & 0x0F ) << 12 ) | ( ( char2 & 0x3F ) << 6 ) | ( ( char3 & 0x3F ) ) );
283                     break;
284 
285                 default:
286                     /* 10xx xxxx, 1111 xxxx */
287                     throw new UTFDataFormatException( "malformed input around byte " + count );
288             }
289         }
290 
291         // The number of chars produced may be less than utflen
292         return new String( chararr, 0, chararrCount );
293     }
294 
295     /**
296      * An index data read result holder
297      */
298     public static class IndexDataReadResult
299     {
300         private Date timestamp;
301 
302         private int documentCount;
303 
304         private Set<String> rootGroups;
305 
306         private Set<String> allGroups;
307 
308         public void setDocumentCount( int documentCount )
309         {
310             this.documentCount = documentCount;
311         }
312 
313         public int getDocumentCount()
314         {
315             return documentCount;
316         }
317 
318         public void setTimestamp( Date timestamp )
319         {
320             this.timestamp = timestamp;
321         }
322 
323         public Date getTimestamp()
324         {
325             return timestamp;
326         }
327 
328         public void setRootGroups( Set<String> rootGroups )
329         {
330             this.rootGroups = rootGroups;
331         }
332 
333         public Set<String> getRootGroups()
334         {
335             return rootGroups;
336         }
337 
338         public void setAllGroups( Set<String> allGroups )
339         {
340             this.allGroups = allGroups;
341         }
342 
343         public Set<String> getAllGroups()
344         {
345             return allGroups;
346         }
347 
348     }
349 
350     /**
351      * Reads index content by using a visitor. <br>
352      * The visitor is called for each read documents after it has been populated with Lucene fields.
353      *
354      * @param visitor an index data visitor
355      * @param context indexing context
356      * @return statistics about read data
357      * @throws IOException in case of an IO exception during index file access
358      */
359     public IndexDataReadResult readIndex( final IndexDataReadVisitor visitor, final IndexingContext context )
360         throws IOException
361     {
362         dis.readByte(); // data format version
363 
364         long timestamp = dis.readLong();
365 
366         Date date = null;
367 
368         if ( timestamp != -1 )
369         {
370             date = new Date( timestamp );
371         }
372 
373         int n = 0;
374 
375         Document doc;
376         while ( ( doc = readDocument() ) != null )
377         {
378             visitor.visitDocument( IndexUtils.updateDocument( doc, context, false ) );
379 
380             n++;
381         }
382 
383         IndexDataReadResult result = new IndexDataReadResult();
384         result.setDocumentCount( n );
385         result.setTimestamp( date );
386         return result;
387     }
388 
389     /**
390      * Visitor of indexed Lucene documents.
391      */
392     public interface IndexDataReadVisitor
393     {
394 
395         /**
396          * Called on each read document. The document is already populated with fields.
397          *
398          * @param document read document
399          */
400         void visitDocument( Document document );
401 
402     }
403 
404 }