View Javadoc

1   package org.apache.maven.index;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0    
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.IOException;
23  import java.io.StringReader;
24  
25  import org.apache.lucene.analysis.TokenStream;
26  import org.apache.lucene.index.Term;
27  import org.apache.lucene.queryParser.ParseException;
28  import org.apache.lucene.queryParser.QueryParser;
29  import org.apache.lucene.queryParser.QueryParser.Operator;
30  import org.apache.lucene.search.BooleanClause.Occur;
31  import org.apache.lucene.search.BooleanQuery;
32  import org.apache.lucene.search.PrefixQuery;
33  import org.apache.lucene.search.Query;
34  import org.apache.lucene.search.TermQuery;
35  import org.apache.lucene.search.WildcardQuery;
36  import org.apache.lucene.util.Version;
37  import org.apache.maven.index.context.NexusAnalyzer;
38  import org.apache.maven.index.creator.JarFileContentsIndexCreator;
39  import org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator;
40  import org.apache.maven.index.expr.SearchExpression;
41  import org.apache.maven.index.expr.SearchTyped;
42  import org.codehaus.plexus.component.annotations.Component;
43  import org.codehaus.plexus.component.annotations.Requirement;
44  import org.codehaus.plexus.logging.Logger;
45  
46  /**
47   * A default {@link QueryCreator} constructs Lucene query for provided query text.
48   * <p>
49   * By default wildcards are created such as query text matches beginning of the field value or beginning of the
50   * class/package name segment for {@link ArtifactInfo#NAMES NAMES} field. But it can be controlled by using special
51   * markers:
52   * <ul>
53   * <li>* - any character</li>
54   * <li>'^' - beginning of the text</li>
55   * <li>'$' or '&lt;' or ' ' end of the text</li>
56   * </ul>
57   * For example:
58   * <ul>
59   * <li>junit - matches junit and junit-foo, but not foo-junit</li>
60   * <li>*junit - matches junit, junit-foo and foo-junit</li>
61   * <li>^junit$ - matches junit, but not junit-foo, nor foo-junit</li>
62   * </ul>
63   * 
64   * @author Eugene Kuleshov
65   */
66  @Component( role = QueryCreator.class )
67  public class DefaultQueryCreator
68      implements QueryCreator
69  {
70      @Requirement
71      private Logger logger;
72  
73      protected Logger getLogger()
74      {
75          return logger;
76      }
77  
78      // ==
79  
80      public IndexerField selectIndexerField( final Field field, final SearchType type )
81      {
82          IndexerField lastField = null;
83  
84          for ( IndexerField indexerField : field.getIndexerFields() )
85          {
86              lastField = indexerField;
87  
88              if ( type.matchesIndexerField( indexerField ) )
89              {
90                  return indexerField;
91              }
92          }
93  
94          return lastField;
95      }
96  
97      public Query constructQuery( final Field field, final SearchExpression expression )
98          throws ParseException
99      {
100         SearchType searchType = SearchType.SCORED;
101 
102         if ( expression instanceof SearchTyped )
103         {
104             searchType = ( (SearchTyped) expression ).getSearchType();
105         }
106 
107         return constructQuery( field, expression.getStringValue(), searchType );
108     }
109 
110     public Query constructQuery( final Field field, final String query, final SearchType type )
111         throws ParseException
112     {
113         if ( type == null )
114         {
115             throw new NullPointerException( "Cannot construct query with type of \"null\"!" );
116         }
117 
118         if ( field == null )
119         {
120             throw new NullPointerException( "Cannot construct query for field \"null\"!" );
121         }
122         else
123         {
124             return constructQuery( field, selectIndexerField( field, type ), query, type );
125         }
126     }
127 
128     @Deprecated
129     public Query constructQuery( String field, String query )
130     {
131         Query result = null;
132 
133         if ( MinimalArtifactInfoIndexCreator.FLD_GROUP_ID_KW.getKey().equals( field )
134             || MinimalArtifactInfoIndexCreator.FLD_ARTIFACT_ID_KW.getKey().equals( field )
135             || MinimalArtifactInfoIndexCreator.FLD_VERSION_KW.getKey().equals( field )
136             || JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.getKey().equals( field ) )
137         {
138             // these are special untokenized fields, kept for use cases like TreeView is (exact matching).
139             result = legacyConstructQuery( field, query );
140         }
141         else
142         {
143             QueryParser qp = new QueryParser( Version.LUCENE_24, field, new NexusAnalyzer() );
144 
145             // small cheap trick
146             // if a query is not "expert" (does not contain field:val kind of expression)
147             // but it contains star and/or punctuation chars, example: "common-log*"
148             if ( !query.contains( ":" ) )
149             {
150                 if ( query.contains( "*" ) && query.matches( ".*(\\.|-|_).*" ) )
151                 {
152                     query =
153                         query.toLowerCase().replaceAll( "\\*", "X" ).replaceAll( "\\.|-|_", " " ).replaceAll( "X", "*" );
154                 }
155             }
156 
157             try
158             {
159                 result = qp.parse( query );
160             }
161             catch ( ParseException e )
162             {
163                 getLogger().debug(
164                     "Query parsing with \"legacy\" method, we got ParseException from QueryParser: " + e.getMessage() );
165 
166                 result = legacyConstructQuery( field, query );
167             }
168         }
169 
170         if ( getLogger().isDebugEnabled() )
171         {
172             getLogger().debug( "Query parsed as: " + result.toString() );
173         }
174 
175         return result;
176     }
177 
178     // ==
179 
180     public Query constructQuery( final Field field, final IndexerField indexerField, final String query,
181                                  final SearchType type )
182         throws ParseException
183     {
184         if ( indexerField == null )
185         {
186             getLogger().warn(
187                 "Querying for field \""
188                     + field.toString()
189                     + "\" without any indexer field was tried. Please review your code, and consider adding this field to index!" );
190 
191             return null;
192         }
193         if ( !indexerField.isIndexed() )
194         {
195             getLogger().warn(
196                 "Querying for non-indexed field " + field.toString()
197                     + " was tried. Please review your code or consider adding this field to index!" );
198 
199             return null;
200         }
201 
202         if ( query.startsWith( "*" ) || query.startsWith( "?" ) )
203         {
204             throw new ParseException( "Query cannot start with '*' or '?'!" );
205         }
206 
207         if ( Field.NOT_PRESENT.equals( query ) )
208         {
209             return new WildcardQuery( new Term( indexerField.getKey(), "*" ) );
210         }
211 
212         if ( SearchType.EXACT.equals( type ) )
213         {
214             if ( indexerField.isKeyword() )
215             {
216                 // no tokenization should happen against the field!
217                 if ( query.contains( "*" ) || query.contains( "?" ) )
218                 {
219                     return new WildcardQuery( new Term( indexerField.getKey(), query ) );
220                 }
221                 else
222                 {
223                     // exactly what callee wants
224                     return new TermQuery( new Term( indexerField.getKey(), query ) );
225                 }
226             }
227             else if ( !indexerField.isKeyword() && indexerField.isStored() )
228             {
229                 // TODO: resolve this better! Decouple QueryCreator and IndexCreators!
230                 // This is a hack/workaround here
231                 if ( JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.equals( indexerField ) )
232                 {
233                     if ( query.startsWith( "/" ) )
234                     {
235                         return new TermQuery( new Term( indexerField.getKey(), query.toLowerCase().replaceAll( "\\.",
236                             "/" ) ) );
237                     }
238                     else
239                     {
240                         return new TermQuery( new Term( indexerField.getKey(), "/"
241                             + query.toLowerCase().replaceAll( "\\.", "/" ) ) );
242                     }
243                 }
244                 else
245                 {
246                     getLogger().warn(
247                         type.toString()
248                             + " type of querying for non-keyword (but stored) field "
249                             + indexerField.getOntology().toString()
250                             + " was tried. Please review your code, or indexCreator involved, since this type of querying of this field is currently unsupported." );
251 
252                     // will never succeed (unless we supply him "filter" too, but that would kill performance)
253                     // and is possible with stored fields only
254                     return null;
255                 }
256             }
257             else
258             {
259                 getLogger().warn(
260                     type.toString()
261                         + " type of querying for non-keyword (and not stored) field "
262                         + indexerField.getOntology().toString()
263                         + " was tried. Please review your code, or indexCreator involved, since this type of querying of this field is impossible." );
264 
265                 // not a keyword indexerField, nor stored. No hope at all. Impossible even with "filtering"
266                 return null;
267             }
268         }
269         else if ( SearchType.SCORED.equals( type ) )
270         {
271             if ( JarFileContentsIndexCreator.FLD_CLASSNAMES.equals( indexerField ) )
272             {
273                 String qpQuery = query.toLowerCase().replaceAll( "\\.", " " ).replaceAll( "/", " " );
274                 // tokenization should happen against the field!
275                 QueryParser qp = new QueryParser( Version.LUCENE_30, indexerField.getKey(), new NexusAnalyzer() );
276                 qp.setDefaultOperator( Operator.AND );
277                 return qp.parse( qpQuery );
278             }
279             else if ( indexerField.isKeyword() )
280             {
281                 // no tokenization should happen against the field!
282                 if ( query.contains( "*" ) || query.contains( "?" ) )
283                 {
284                     return new WildcardQuery( new Term( indexerField.getKey(), query ) );
285                 }
286                 else
287                 {
288                     BooleanQuery bq = new BooleanQuery();
289 
290                     Term t = new Term( indexerField.getKey(), query );
291 
292                     bq.add( new TermQuery( t ), Occur.SHOULD );
293 
294                     PrefixQuery pq = new PrefixQuery( t );
295                     pq.setBoost( 0.8f );
296 
297                     bq.add( pq, Occur.SHOULD );
298 
299                     return bq;
300                 }
301             }
302             else
303             {
304                 // to save "original" query
305                 String qpQuery = query;
306 
307                 // tokenization should happen against the field!
308                 QueryParser qp = new QueryParser( Version.LUCENE_30, indexerField.getKey(), new NexusAnalyzer() );
309                 qp.setDefaultOperator( Operator.AND );
310 
311                 // small cheap trick
312                 // if a query is not "expert" (does not contain field:val kind of expression)
313                 // but it contains star and/or punctuation chars, example: "common-log*"
314                 // since Lucene does not support multi-terms WITH wildcards.
315                 // So, here, we "mimic" NexusAnalyzer (this should be fixed!)
316                 // but do this with PRESERVING original query!
317                 if ( qpQuery.matches( ".*(\\.|-|_|/).*" ) )
318                 {
319                     qpQuery =
320                         qpQuery.toLowerCase().replaceAll( "\\*", "X" ).replaceAll( "\\.|-|_|/", " " ).replaceAll( "X",
321                             "*" ).replaceAll( " \\* ", "" ).replaceAll( "^\\* ", "" ).replaceAll( " \\*$", "" );
322                 }
323 
324                 // "fix" it with trailing "*" if not there, but only if it not ends with a space
325                 if ( !qpQuery.endsWith( "*" ) && !qpQuery.endsWith( " " ) )
326                 {
327                     qpQuery += "*";
328                 }
329 
330                 try
331                 {
332                     // qpQuery = "\"" + qpQuery + "\"";
333 
334                     BooleanQuery q1 = new BooleanQuery();
335 
336                     q1.add( qp.parse( qpQuery ), Occur.SHOULD );
337 
338                     if ( qpQuery.contains( " " ) )
339                     {
340                         q1.add( qp.parse( "\"" + qpQuery + "\"" ), Occur.SHOULD );
341                     }
342 
343                     Query q2 = null;
344 
345                     int termCount = countTerms( indexerField, query );
346 
347                     // try with KW only if the processed query in qpQuery does not have spaces!
348                     if ( !query.contains( " " ) && termCount > 1 )
349                     {
350                         // get the KW field
351                         IndexerField keywordField = selectIndexerField( indexerField.getOntology(), SearchType.EXACT );
352 
353                         if ( keywordField.isKeyword() )
354                         {
355                             q2 = constructQuery( indexerField.getOntology(), keywordField, query, type );
356                         }
357                     }
358 
359                     if ( q2 == null )
360                     {
361                         return q1;
362                     }
363                     else
364                     {
365                         BooleanQuery bq = new BooleanQuery();
366 
367                         // trick with order
368                         bq.add( q2, Occur.SHOULD );
369                         bq.add( q1, Occur.SHOULD );
370 
371                         return bq;
372                     }
373                 }
374                 catch ( ParseException e )
375                 {
376                     // TODO: we are not falling back anymore to legacy!
377                     throw e;
378 
379                     // getLogger().debug(
380                     // "Query parsing with \"legacy\" method, we got ParseException from QueryParser: "
381                     // + e.getMessage() );
382                     //
383                     // return legacyConstructQuery( indexerField.getKey(), query );
384                 }
385             }
386         }
387         else
388         {
389             // what search type is this?
390             return null;
391         }
392     }
393 
394     public Query legacyConstructQuery( String field, String query )
395     {
396         if ( query == null || query.length() == 0 )
397         {
398             getLogger().info( "Empty or null query for field:" + field );
399 
400             return null;
401         }
402 
403         String q = query.toLowerCase();
404 
405         char h = query.charAt( 0 );
406 
407         if ( JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.getKey().equals( field )
408             || JarFileContentsIndexCreator.FLD_CLASSNAMES.getKey().equals( field ) )
409         {
410             q = q.replaceAll( "\\.", "/" );
411 
412             if ( h == '^' )
413             {
414                 q = q.substring( 1 );
415 
416                 if ( q.charAt( 0 ) != '/' )
417                 {
418                     q = '/' + q;
419                 }
420             }
421             else if ( h != '*' )
422             {
423                 q = "*/" + q;
424             }
425         }
426         else
427         {
428             if ( h == '^' )
429             {
430                 q = q.substring( 1 );
431             }
432             else if ( h != '*' )
433             {
434                 q = "*" + q;
435             }
436         }
437 
438         int l = q.length() - 1;
439         char c = q.charAt( l );
440         if ( c == ' ' || c == '<' || c == '$' )
441         {
442             q = q.substring( 0, q.length() - 1 );
443         }
444         else if ( c != '*' )
445         {
446             q += "*";
447         }
448 
449         int n = q.indexOf( '*' );
450         if ( n == -1 )
451         {
452             return new TermQuery( new Term( field, q ) );
453         }
454         else if ( n > 0 && n == q.length() - 1 )
455         {
456             return new PrefixQuery( new Term( field, q.substring( 0, q.length() - 1 ) ) );
457         }
458 
459         return new WildcardQuery( new Term( field, q ) );
460     }
461 
462     // ==
463 
464     private NexusAnalyzer nexusAnalyzer = new NexusAnalyzer();
465 
466     protected int countTerms( final IndexerField indexerField, final String query )
467     {
468         try
469         {
470             TokenStream ts = nexusAnalyzer.reusableTokenStream( indexerField.getKey(), new StringReader( query ) );
471 
472             int result = 0;
473 
474             while ( ts.incrementToken() )
475             {
476                 result++;
477             }
478 
479             return result;
480         }
481         catch ( IOException e )
482         {
483             // will not happen
484             return 1;
485         }
486     }
487 }