View Javadoc
1   package org.apache.maven.index;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0    
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import javax.inject.Named;
23  import javax.inject.Singleton;
24  import java.io.IOException;
25  import java.io.StringReader;
26  import org.apache.lucene.analysis.TokenStream;
27  import org.apache.lucene.index.Term;
28  import org.apache.lucene.queryparser.classic.ParseException;
29  import org.apache.lucene.queryparser.classic.QueryParser;
30  import org.apache.lucene.queryparser.classic.QueryParser.Operator;
31  import org.apache.lucene.search.BooleanClause.Occur;
32  import org.apache.lucene.search.BooleanQuery;
33  import org.apache.lucene.search.BoostQuery;
34  import org.apache.lucene.search.PrefixQuery;
35  import org.apache.lucene.search.Query;
36  import org.apache.lucene.search.TermQuery;
37  import org.apache.lucene.search.WildcardQuery;
38  import org.apache.maven.index.context.NexusAnalyzer;
39  import org.apache.maven.index.creator.JarFileContentsIndexCreator;
40  import org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator;
41  import org.apache.maven.index.expr.SearchExpression;
42  import org.apache.maven.index.expr.SearchTyped;
43  import org.slf4j.Logger;
44  import org.slf4j.LoggerFactory;
45  
/**
 * A default {@link QueryCreator} that constructs Lucene queries for the provided query text.
 * <p>
 * By default, wildcards are added so that the query text matches the beginning of the field value, or the beginning
 * of a class/package name segment for the {@link ArtifactInfo#NAMES NAMES} field. This behavior can be controlled by
 * using special markers:
 * <ul>
 * <li>'*' - any sequence of characters</li>
 * <li>'^' - beginning of the text</li>
 * <li>'$' or '&lt;' or ' ' - end of the text</li>
 * </ul>
 * For example:
 * <ul>
 * <li>junit - matches junit and junit-foo, but not foo-junit</li>
 * <li>*junit - matches junit, junit-foo and foo-junit</li>
 * <li>^junit$ - matches junit, but neither junit-foo nor foo-junit</li>
 * </ul>
 * 
 * @author Eugene Kuleshov
 */
66  @Singleton
67  @Named
68  public class DefaultQueryCreator
69      implements QueryCreator
70  {
71  
72      private final Logger logger = LoggerFactory.getLogger( getClass() );
73  
74      protected Logger getLogger()
75      {
76          return logger;
77      }
78  
79      // ==
80  
81      public IndexerField selectIndexerField( final Field field, final SearchType type )
82      {
83          IndexerField lastField = null;
84  
85          for ( IndexerField indexerField : field.getIndexerFields() )
86          {
87              lastField = indexerField;
88  
89              if ( type.matchesIndexerField( indexerField ) )
90              {
91                  return indexerField;
92              }
93          }
94  
95          return lastField;
96      }
97  
98      public Query constructQuery( final Field field, final SearchExpression expression )
99          throws ParseException
100     {
101         SearchType searchType = SearchType.SCORED;
102 
103         if ( expression instanceof SearchTyped )
104         {
105             searchType = ( (SearchTyped) expression ).getSearchType();
106         }
107 
108         return constructQuery( field, expression.getStringValue(), searchType );
109     }
110 
111     public Query constructQuery( final Field field, final String query, final SearchType type )
112         throws ParseException
113     {
114         if ( type == null )
115         {
116             throw new NullPointerException( "Cannot construct query with type of \"null\"!" );
117         }
118 
119         if ( field == null )
120         {
121             throw new NullPointerException( "Cannot construct query for field \"null\"!" );
122         }
123         else
124         {
125             return constructQuery( field, selectIndexerField( field, type ), query, type );
126         }
127     }
128 
129     @Deprecated
130     public Query constructQuery( String field, String query )
131     {
132         Query result;
133 
134         if ( MinimalArtifactInfoIndexCreator.FLD_GROUP_ID_KW.getKey().equals( field )
135             || MinimalArtifactInfoIndexCreator.FLD_ARTIFACT_ID_KW.getKey().equals( field )
136             || MinimalArtifactInfoIndexCreator.FLD_VERSION_KW.getKey().equals( field )
137             || JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.getKey().equals( field ) )
138         {
139             // these are special untokenized fields, kept for use cases like TreeView is (exact matching).
140             result = legacyConstructQuery( field, query );
141         }
142         else
143         {
144             QueryParser qp = new QueryParser( field, new NexusAnalyzer() );
145 
146             // small cheap trick
147             // if a query is not "expert" (does not contain field:val kind of expression)
148             // but it contains star and/or punctuation chars, example: "common-log*"
149             if ( !query.contains( ":" ) )
150             {
151                 if ( query.contains( "*" ) && query.matches( ".*(\\.|-|_).*" ) )
152                 {
153                     query = query.toLowerCase().replaceAll( "\\*", "X" ).replaceAll( "\\.|-|_", " " ).replaceAll( "X",
154                                                                                                                   "*" );
155                 }
156             }
157 
158             try
159             {
160                 result = qp.parse( query );
161             }
162             catch ( ParseException e )
163             {
164                 getLogger().debug(
165                     "Query parsing with \"legacy\" method, we got ParseException from QueryParser: " + e.getMessage() );
166 
167                 result = legacyConstructQuery( field, query );
168             }
169         }
170 
171         if ( getLogger().isDebugEnabled() )
172         {
173             getLogger().debug( "Query parsed as: " + result.toString() );
174         }
175 
176         return result;
177     }
178 
179     // ==
180 
181     public Query constructQuery( final Field field, final IndexerField indexerField, final String query,
182                                  final SearchType type )
183         throws ParseException
184     {
185         if ( indexerField == null )
186         {
187             getLogger().warn( "Querying for field \"" + field.toString() + "\" without any indexer field was tried. "
188                 + "Please review your code, and consider adding this field to index!" );
189 
190             return null;
191         }
192         if ( !indexerField.isIndexed() )
193         {
194             getLogger().warn(
195                 "Querying for non-indexed field " + field.toString()
196                     + " was tried. Please review your code or consider adding this field to index!" );
197 
198             return null;
199         }
200 
201         if ( Field.NOT_PRESENT.equals( query ) )
202         {
203             return new WildcardQuery( new Term( indexerField.getKey(), "*" ) );
204         }
205 
206         if ( SearchType.EXACT.equals( type ) )
207         {
208             if ( indexerField.isKeyword() )
209             {
210                 // no tokenization should happen against the field!
211                 if ( query.contains( "*" ) || query.contains( "?" ) )
212                 {
213                     return new WildcardQuery( new Term( indexerField.getKey(), query ) );
214                 }
215                 else
216                 {
217                     // exactly what callee wants
218                     return new TermQuery( new Term( indexerField.getKey(), query ) );
219                 }
220             }
221             else if ( !indexerField.isKeyword() && indexerField.isStored() )
222             {
223                 // TODO: resolve this better! Decouple QueryCreator and IndexCreators!
224                 // This is a hack/workaround here
225                 if ( JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.equals( indexerField ) )
226                 {
227                     if ( query.startsWith( "/" ) )
228                     {
229                         return new TermQuery( new Term( indexerField.getKey(), query.toLowerCase().replaceAll( "\\.",
230                             "/" ) ) );
231                     }
232                     else
233                     {
234                         return new TermQuery( new Term( indexerField.getKey(), "/"
235                             + query.toLowerCase().replaceAll( "\\.", "/" ) ) );
236                     }
237                 }
238                 else
239                 {
240                     getLogger().warn(
241                         type.toString()
242                             + " type of querying for non-keyword (but stored) field "
243                             + indexerField.getOntology().toString()
244                             + " was tried. Please review your code, or indexCreator involved, "
245                             + "since this type of querying of this field is currently unsupported." );
246 
247                     // will never succeed (unless we supply him "filter" too, but that would kill performance)
248                     // and is possible with stored fields only
249                     return null;
250                 }
251             }
252             else
253             {
254                 getLogger().warn(
255                     type.toString()
256                         + " type of querying for non-keyword (and not stored) field "
257                         + indexerField.getOntology().toString()
258                         + " was tried. Please review your code, or indexCreator involved, "
259                         + "since this type of querying of this field is impossible." );
260 
261                 // not a keyword indexerField, nor stored. No hope at all. Impossible even with "filtering"
262                 return null;
263             }
264         }
265         else if ( SearchType.SCORED.equals( type ) )
266         {
267             if ( JarFileContentsIndexCreator.FLD_CLASSNAMES.equals( indexerField ) )
268             {
269                 String qpQuery = query.toLowerCase().replaceAll( "\\.", " " ).replaceAll( "/", " " );
270                 // tokenization should happen against the field!
271                 QueryParser qp = new QueryParser( indexerField.getKey(), new NexusAnalyzer() );
272                 qp.setDefaultOperator( Operator.AND );
273                 return qp.parse( qpQuery );
274             }
275             else if ( indexerField.isKeyword() )
276             {
277                 // no tokenization should happen against the field!
278                 if ( query.contains( "*" ) || query.contains( "?" ) )
279                 {
280                     return new WildcardQuery( new Term( indexerField.getKey(), query ) );
281                 }
282                 else
283                 {
284                     Term t = new Term( indexerField.getKey(), query );
285                     return new BooleanQuery.Builder()
286                         .add( new TermQuery( t ), Occur.SHOULD )
287                         .add( new BoostQuery( new PrefixQuery( t ), 0.8f ), Occur.SHOULD )
288                         .build();
289             }
290             }
291             else
292             {
293                 // to save "original" query
294                 String qpQuery = query;
295 
296                 // tokenization should happen against the field!
297                 QueryParser qp = new QueryParser( indexerField.getKey(), new NexusAnalyzer() );
298                 qp.setDefaultOperator( Operator.AND );
299 
300                 // small cheap trick
301                 // if a query is not "expert" (does not contain field:val kind of expression)
302                 // but it contains star and/or punctuation chars, example: "common-log*"
303                 // since Lucene does not support multi-terms WITH wildcards.
304                 // So, here, we "mimic" NexusAnalyzer (this should be fixed!)
305                 // but do this with PRESERVING original query!
306                 if ( qpQuery.matches( ".*(\\.|-|_|/).*" ) )
307                 {
308                     qpQuery =
309                         qpQuery.toLowerCase().replaceAll( "\\*", "X" ).replaceAll( "\\.|-|_|/", " " ).replaceAll( "X",
310                             "*" ).replaceAll( " \\* ", "" ).replaceAll( "^\\* ", "" ).replaceAll( " \\*$", "" );
311                 }
312 
313                 // "fix" it with trailing "*" if not there, but only if it not ends with a space
314                 if ( !qpQuery.endsWith( "*" ) && !qpQuery.endsWith( " " ) )
315                 {
316                     qpQuery += "*";
317                 }
318 
319                 try
320                 {
321                     // qpQuery = "\"" + qpQuery + "\"";
322 
323                     BooleanQuery.Builder q1b = new BooleanQuery.Builder()
324                             .add( qp.parse( qpQuery ), Occur.SHOULD );
325 
326                     if ( qpQuery.contains( " " ) )
327                     {
328                         q1b.add( qp.parse( "\"" + qpQuery + "\"" ), Occur.SHOULD );
329                     }
330 
331                     Query q2 = null;
332 
333                     int termCount = countTerms( indexerField, query );
334 
335                     // try with KW only if the processed query in qpQuery does not have spaces!
336                     if ( !query.contains( " " ) && termCount > 1 )
337                     {
338                         // get the KW field
339                         IndexerField keywordField = selectIndexerField( indexerField.getOntology(), SearchType.EXACT );
340 
341                         if ( keywordField.isKeyword() )
342                         {
343                             q2 = constructQuery( indexerField.getOntology(), keywordField, query, type );
344                         }
345                     }
346 
347                     if ( q2 == null )
348                     {
349                         return q1b.build();
350                     }
351                     else
352                     {
353                         return new BooleanQuery.Builder()
354                             // trick with order
355                             .add( q2, Occur.SHOULD )
356                             .add( q1b.build(), Occur.SHOULD )
357                             .build();
358                     }
359                 }
360                 catch ( ParseException e )
361                 {
362                     // TODO: we are not falling back anymore to legacy!
363                     throw e;
364 
365                     // getLogger().debug(
366                     // "Query parsing with \"legacy\" method, we got ParseException from QueryParser: "
367                     // + e.getMessage() );
368                     //
369                     // return legacyConstructQuery( indexerField.getKey(), query );
370                 }
371             }
372         }
373         else
374         {
375             // what search type is this?
376             return null;
377         }
378     }
379 
380     public Query legacyConstructQuery( String field, String query )
381     {
382         if ( query == null || query.length() == 0 )
383         {
384             getLogger().info( "Empty or null query for field:" + field );
385 
386             return null;
387         }
388 
389         String q = query.toLowerCase();
390 
391         char h = query.charAt( 0 );
392 
393         if ( JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.getKey().equals( field )
394             || JarFileContentsIndexCreator.FLD_CLASSNAMES.getKey().equals( field ) )
395         {
396             q = q.replaceAll( "\\.", "/" );
397 
398             if ( h == '^' )
399             {
400                 q = q.substring( 1 );
401 
402                 if ( q.charAt( 0 ) != '/' )
403                 {
404                     q = '/' + q;
405                 }
406             }
407             else if ( h != '*' )
408             {
409                 q = "*/" + q;
410             }
411         }
412         else
413         {
414             if ( h == '^' )
415             {
416                 q = q.substring( 1 );
417             }
418             else if ( h != '*' )
419             {
420                 q = "*" + q;
421             }
422         }
423 
424         int l = q.length() - 1;
425         char c = q.charAt( l );
426         if ( c == ' ' || c == '<' || c == '$' )
427         {
428             q = q.substring( 0, q.length() - 1 );
429         }
430         else if ( c != '*' )
431         {
432             q += "*";
433         }
434 
435         int n = q.indexOf( '*' );
436         if ( n == -1 )
437         {
438             return new TermQuery( new Term( field, q ) );
439         }
440         else if ( n > 0 && n == q.length() - 1 )
441         {
442             return new PrefixQuery( new Term( field, q.substring( 0, q.length() - 1 ) ) );
443         }
444 
445         return new WildcardQuery( new Term( field, q ) );
446     }
447 
448     // ==
449 
450     private NexusAnalyzer nexusAnalyzer = new NexusAnalyzer();
451 
452     protected int countTerms( final IndexerField indexerField, final String query )
453     {
454         try
455         {
456             TokenStream ts = nexusAnalyzer.tokenStream( indexerField.getKey(), new StringReader( query ) );
457             ts.reset();
458 
459             int result = 0;
460 
461             while ( ts.incrementToken() )
462             {
463                 result++;
464             }
465             
466             ts.end();
467             ts.close();
468 
469             return result;
470         }
471         catch ( IOException e )
472         {
473             // will not happen
474             return 1;
475         }
476     }
477 }