View Javadoc
1   package org.apache.maven.wagon.shared.http;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import org.apache.commons.io.IOUtils;
23  import org.apache.commons.lang.StringUtils;
24  import org.apache.maven.wagon.TransferFailedException;
25  import org.jsoup.Jsoup;
26  import org.jsoup.nodes.Document;
27  import org.jsoup.nodes.Element;
28  import org.jsoup.select.Elements;
29  
30  import java.io.IOException;
31  import java.io.InputStream;
32  import java.io.UnsupportedEncodingException;
33  import java.net.URI;
34  import java.net.URISyntaxException;
35  import java.net.URLDecoder;
36  import java.util.ArrayList;
37  import java.util.HashSet;
38  import java.util.List;
39  import java.util.Set;
40  import java.util.regex.Pattern;
41  
42  /**
43   * Html File List Parser.
44   */
45  public class HtmlFileListParser
46  {
47      // Apache Fancy Index Sort Headers
48      private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
49  
50      // URLs with excessive paths.
51      private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
52  
53      // URLs that to a parent directory.
54      private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
55  
56      // mailto urls
57      private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
58  
59      private static final Pattern[] SKIPS =
60          new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
61  
62      /**
63       * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
64       *
65       * @param stream the input stream.
66       * @return the file list.
67       * @throws TransferFailedException if there was a problem fetching the raw html.
68       */
69      public static List<String> parseFileList( String baseurl, InputStream stream )
70          throws TransferFailedException
71      {
72          try
73          {
74              URI baseURI = new URI( baseurl );
75              // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
76              // assumption.
77              String content = IOUtils.toString( stream, "utf-8" );
78              Document doc = Jsoup.parse( content, baseurl );
79              Elements links = doc.select("a[href]");
80              Set<String> results = new HashSet<String>();
81              for ( Element link : links )
82              {
83                  /*
84                   * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
85                   */
86                  String target = link.attr( "href" );
87                  if ( target != null )
88                  {
89                      String clean = cleanLink( baseURI, target );
90                      if ( isAcceptableLink( clean ) )
91                      {
92                          results.add( clean );
93                      }
94                  }
95  
96              }
97  
98              return new ArrayList<String>( results );
99          }
100         catch ( URISyntaxException e )
101         {
102             throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
103         }
104         catch ( IOException e )
105         {
106             throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
107         }
108     }
109 
110     private static String cleanLink( URI baseURI, String link )
111     {
112         if ( StringUtils.isEmpty( link ) )
113         {
114             return "";
115         }
116 
117         String ret = link;
118 
119         try
120         {
121             URI linkuri = new URI( ret );
122             if ( link.startsWith( "/" ) )
123             {
124                 linkuri = baseURI.resolve( linkuri );
125             }
126             URI relativeURI = baseURI.relativize( linkuri ).normalize();
127             ret = relativeURI.toASCIIString();
128             if ( ret.startsWith( baseURI.getPath() ) )
129             {
130                 ret = ret.substring( baseURI.getPath().length() );
131             }
132 
133             ret = URLDecoder.decode( ret, "UTF-8" );
134         }
135         catch ( URISyntaxException e )
136         {
137         }
138         catch ( UnsupportedEncodingException e )
139         {
140         }
141 
142         return ret;
143     }
144 
145     private static boolean isAcceptableLink( String link )
146     {
147         if ( StringUtils.isEmpty( link ) )
148         {
149             return false;
150         }
151 
152         for ( Pattern pattern : SKIPS )
153         {
154             if ( pattern.matcher( link ).find() )
155             {
156                 return false;
157             }
158         }
159 
160         return true;
161     }
162 
163 }