View Javadoc

1   package org.apache.maven.wagon.shared.http;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import org.apache.maven.wagon.TransferFailedException;
23  import org.apache.xerces.xni.Augmentations;
24  import org.apache.xerces.xni.QName;
25  import org.apache.xerces.xni.XMLAttributes;
26  import org.apache.xerces.xni.parser.XMLInputSource;
27  import org.apache.xerces.xni.parser.XMLParserConfiguration;
28  import org.codehaus.plexus.util.StringUtils;
29  import org.cyberneko.html.HTMLConfiguration;
30  import org.cyberneko.html.filters.DefaultFilter;
31  
32  import java.io.IOException;
33  import java.io.InputStream;
34  import java.io.UnsupportedEncodingException;
35  import java.net.URI;
36  import java.net.URISyntaxException;
37  import java.net.URLDecoder;
38  import java.util.ArrayList;
39  import java.util.HashSet;
40  import java.util.List;
41  import java.util.Set;
42  import java.util.regex.Pattern;
43  
44  /**
45   * Html File List Parser.
46   */
47  public class HtmlFileListParser
48  {
49      /**
50       * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
51       *
52       * @return the file list.
53       * @throws TransferFailedException if there was a problem fetching the raw html.
54       */
55      public static List<String> parseFileList( String baseurl, InputStream stream )
56          throws TransferFailedException
57      {
58          try
59          {
60              // Use URI object to get benefits of proper absolute and relative path resolution for free
61              URI baseURI = new URI( baseurl );
62  
63              Parser handler = new Parser( baseURI );
64  
65              XMLParserConfiguration parser = new HTMLConfiguration();
66              parser.setDocumentHandler( handler );
67              parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
68              parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
69              parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
70              parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
71  
72              return new ArrayList<String>( handler.getLinks() );
73  
74          }
75          catch ( URISyntaxException e )
76          {
77              throw new TransferFailedException( "Unable to parse as URI: " + baseurl, e );
78          }
79          catch ( IOException e )
80          {
81              throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
82          }
83      }
84  
85      private static class Parser
86          extends DefaultFilter
87      {
88          // Apache Fancy Index Sort Headers
89          private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
90  
91          // URLs with excessive paths.
92          private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
93  
94          // URLs that to a parent directory.
95          private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
96  
97          // mailto urls
98          private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
99  
100         private static final Pattern[] SKIPS =
101             new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
102 
103         private Set<String> links = new HashSet<String>();
104 
105         private URI baseURI;
106 
107         public Parser( URI baseURI )
108         {
109             this.baseURI = baseURI.normalize();
110         }
111 
112         public Set<String> getLinks()
113         {
114             return links;
115         }
116 
117         public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
118         {
119             if ( "A".equals( element.rawname ) )
120             {
121                 String href = attrs.getValue( "HREF" );
122                 if ( href != null )
123                 {
124                     String link = cleanLink( baseURI, href );
125                     if ( isAcceptableLink( link ) )
126                     {
127                         links.add( link );
128                     }
129                 }
130             }
131         }
132 
133         private static String cleanLink( URI baseURI, String link )
134         {
135             if ( StringUtils.isEmpty( link ) )
136             {
137                 return "";
138             }
139 
140             String ret = link;
141 
142             try
143             {
144                 URI linkuri = new URI( ret );
145                 URI relativeURI = baseURI.relativize( linkuri ).normalize();
146                 ret = relativeURI.toASCIIString();
147                 if ( ret.startsWith( baseURI.getPath() ) )
148                 {
149                     ret = ret.substring( baseURI.getPath().length() );
150                 }
151 
152                 ret = URLDecoder.decode( ret, "UTF-8" );
153             }
154             catch ( URISyntaxException e )
155             {
156             }
157             catch ( UnsupportedEncodingException e )
158             {
159             }
160 
161             return ret;
162         }
163 
164         private static boolean isAcceptableLink( String link )
165         {
166             if ( StringUtils.isEmpty( link ) )
167             {
168                 return false;
169             }
170 
171             for ( int i = 0; i < SKIPS.length; i++ )
172             {
173                 if ( SKIPS[i].matcher( link ).find() )
174                 {
175                     return false;
176                 }
177             }
178 
179             return true;
180         }
181     }
182 }