View Javadoc

1   package org.apache.maven.wagon.shared.http4;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
import org.apache.commons.io.IOUtils;
import org.apache.maven.wagon.TransferFailedException;
import org.codehaus.plexus.util.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
41  
42  /**
43   * Html File List Parser.
44   */
45  public class HtmlFileListParser
46  {
47      // Apache Fancy Index Sort Headers
48      private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
49  
50      // URLs with excessive paths.
51      private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
52  
53      // URLs that to a parent directory.
54      private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
55  
56      // mailto urls
57      private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
58  
59      private static final Pattern[] SKIPS =
60          new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
61  
62      /**
63       * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
64       *
65       * @param stream the input stream.
66       * @return the file list.
67       * @throws TransferFailedException if there was a problem fetching the raw html.
68       */
69      public static List<String> parseFileList( String baseurl, InputStream stream )
70          throws TransferFailedException
71      {
72          try
73          {
74              URI baseURI = new URI( baseurl );
75              // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
76              // assumption.
77              String content = IOUtils.toString( stream, "utf-8" );
78              Document doc = Jsoup.parse( content, baseurl );
79              Elements links = doc.select("a[href]");
80              Set<String> results = new HashSet<String>();
81              for ( int lx = 0; lx < links.size(); lx++ )
82              {
83                  Element link = links.get( lx );
84                  /*
85                   * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
86                   */
87                  String target = link.attr( "href" );
88                  if ( target != null )
89                  {
90                      String clean = cleanLink( baseURI, target );
91                      if ( isAcceptableLink( clean ) )
92                      {
93                          results.add( clean );
94                      }
95                  }
96  
97              }
98  
99              return new ArrayList<String>( results );
100         }
101         catch ( URISyntaxException e )
102         {
103             throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
104         }
105         catch ( IOException e )
106         {
107             throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
108         }
109     }
110 
111     private static String cleanLink( URI baseURI, String link )
112     {
113         if ( StringUtils.isEmpty( link ) )
114         {
115             return "";
116         }
117 
118         String ret = link;
119 
120         try
121         {
122             URI linkuri = new URI( ret );
123             if ( link.startsWith( "/" ) )
124             {
125                 linkuri = baseURI.resolve( linkuri );
126             }
127             URI relativeURI = baseURI.relativize( linkuri ).normalize();
128             ret = relativeURI.toASCIIString();
129             if ( ret.startsWith( baseURI.getPath() ) )
130             {
131                 ret = ret.substring( baseURI.getPath().length() );
132             }
133 
134             ret = URLDecoder.decode( ret, "UTF-8" );
135         }
136         catch ( URISyntaxException e )
137         {
138         }
139         catch ( UnsupportedEncodingException e )
140         {
141         }
142 
143         return ret;
144     }
145 
146     private static boolean isAcceptableLink( String link )
147     {
148         if ( StringUtils.isEmpty( link ) )
149         {
150             return false;
151         }
152 
153         for ( int i = 0; i < SKIPS.length; i++ )
154         {
155             if ( SKIPS[i].matcher( link ).find() )
156             {
157                 return false;
158             }
159         }
160 
161         return true;
162     }
163 
164 }