View Javadoc
1   package org.apache.maven.wagon.shared.http;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import org.apache.commons.io.IOUtils;
23  import org.apache.maven.wagon.TransferFailedException;
24  import org.jsoup.Jsoup;
25  import org.jsoup.nodes.Document;
26  import org.jsoup.nodes.Element;
27  import org.jsoup.select.Elements;
28  
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.io.UnsupportedEncodingException;
32  import java.net.URI;
33  import java.net.URISyntaxException;
34  import java.net.URLDecoder;
35  import java.util.ArrayList;
36  import java.util.HashSet;
37  import java.util.List;
38  import java.util.Set;
39  import java.util.regex.Pattern;
40  
41  /**
42   * Html File List Parser.
43   */
44  public class HtmlFileListParser
45  {
46      // Apache Fancy Index Sort Headers
47      private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
48  
49      // URLs with excessive paths.
50      private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
51  
52      // URLs that to a parent directory.
53      private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
54  
55      // mailto urls
56      private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
57  
58      private static final Pattern[] SKIPS =
59          new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
60  
61      /**
62       * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
63       *
64       * @param stream the input stream.
65       * @return the file list.
66       * @throws TransferFailedException if there was a problem fetching the raw html.
67       */
68      public static List<String> parseFileList( String baseurl, InputStream stream )
69          throws TransferFailedException
70      {
71          try
72          {
73              URI baseURI = new URI( baseurl );
74              // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
75              // assumption.
76              String content = IOUtils.toString( stream, "utf-8" );
77              Document doc = Jsoup.parse( content, baseurl );
78              Elements links = doc.select( "a[href]" );
79              Set<String> results = new HashSet<String>();
80              for ( Element link : links )
81              {
82                  /*
83                   * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
84                   */
85                  String target = link.attr( "href" );
86                  if ( target != null )
87                  {
88                      String clean = cleanLink( baseURI, target );
89                      if ( isAcceptableLink( clean ) )
90                      {
91                          results.add( clean );
92                      }
93                  }
94  
95              }
96  
97              return new ArrayList<String>( results );
98          }
99          catch ( URISyntaxException e )
100         {
101             throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
102         }
103         catch ( IOException e )
104         {
105             throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
106         }
107     }
108 
109     private static String cleanLink( URI baseURI, String link )
110     {
111         if ( link == null || link.length() == 0 )
112         {
113             return "";
114         }
115 
116         String ret = link;
117 
118         try
119         {
120             URI linkuri = new URI( ret );
121             if ( link.startsWith( "/" ) )
122             {
123                 linkuri = baseURI.resolve( linkuri );
124             }
125             URI relativeURI = baseURI.relativize( linkuri ).normalize();
126             ret = relativeURI.toASCIIString();
127             if ( ret.startsWith( baseURI.getPath() ) )
128             {
129                 ret = ret.substring( baseURI.getPath().length() );
130             }
131 
132             ret = URLDecoder.decode( ret, "UTF-8" );
133         }
134         catch ( URISyntaxException e )
135         {
136             // ignore
137         }
138         catch ( UnsupportedEncodingException e )
139         {
140             // ignore
141         }
142 
143         return ret;
144     }
145 
146     private static boolean isAcceptableLink( String link )
147     {
148         if ( link == null || link.length() == 0 )
149         {
150             return false;
151         }
152 
153         for ( Pattern pattern : SKIPS )
154         {
155             if ( pattern.matcher( link ).find() )
156             {
157                 return false;
158             }
159         }
160 
161         return true;
162     }
163 
164 }