001package org.apache.maven.wagon.shared.http;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import org.apache.maven.wagon.TransferFailedException;
023import org.apache.xerces.xni.Augmentations;
024import org.apache.xerces.xni.QName;
025import org.apache.xerces.xni.XMLAttributes;
026import org.apache.xerces.xni.parser.XMLInputSource;
027import org.apache.xerces.xni.parser.XMLParserConfiguration;
028import org.codehaus.plexus.util.StringUtils;
029import org.cyberneko.html.HTMLConfiguration;
030import org.cyberneko.html.filters.DefaultFilter;
031
032import java.io.IOException;
033import java.io.InputStream;
034import java.io.UnsupportedEncodingException;
035import java.net.URI;
036import java.net.URISyntaxException;
037import java.net.URLDecoder;
038import java.util.ArrayList;
039import java.util.HashSet;
040import java.util.List;
041import java.util.Set;
042import java.util.regex.Pattern;
043
044/**
045 * Html File List Parser.
046 */
047public class HtmlFileListParser
048{
049    /**
050     * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
051     *
052     * @return the file list.
053     * @throws TransferFailedException if there was a problem fetching the raw html.
054     */
055    public static List<String> parseFileList( String baseurl, InputStream stream )
056        throws TransferFailedException
057    {
058        try
059        {
060            // Use URI object to get benefits of proper absolute and relative path resolution for free
061            URI baseURI = new URI( baseurl );
062
063            Parser handler = new Parser( baseURI );
064
065            XMLParserConfiguration parser = new HTMLConfiguration();
066            parser.setDocumentHandler( handler );
067            parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
068            parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
069            parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
070            parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
071
072            return new ArrayList<String>( handler.getLinks() );
073
074        }
075        catch ( URISyntaxException e )
076        {
077            throw new TransferFailedException( "Unable to parse as URI: " + baseurl, e );
078        }
079        catch ( IOException e )
080        {
081            throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
082        }
083    }
084
085    private static class Parser
086        extends DefaultFilter
087    {
088        // Apache Fancy Index Sort Headers
089        private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
090
091        // URLs with excessive paths.
092        private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
093
094        // URLs that to a parent directory.
095        private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
096
097        // mailto urls
098        private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
099
100        private static final Pattern[] SKIPS =
101            new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
102
103        private Set<String> links = new HashSet<String>();
104
105        private URI baseURI;
106
107        public Parser( URI baseURI )
108        {
109            this.baseURI = baseURI.normalize();
110        }
111
112        public Set<String> getLinks()
113        {
114            return links;
115        }
116
117        public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
118        {
119            if ( "A".equals( element.rawname ) )
120            {
121                String href = attrs.getValue( "HREF" );
122                if ( href != null )
123                {
124                    String link = cleanLink( baseURI, href );
125                    if ( isAcceptableLink( link ) )
126                    {
127                        links.add( link );
128                    }
129                }
130            }
131        }
132
133        private static String cleanLink( URI baseURI, String link )
134        {
135            if ( StringUtils.isEmpty( link ) )
136            {
137                return "";
138            }
139
140            String ret = link;
141
142            try
143            {
144                URI linkuri = new URI( ret );
145                URI relativeURI = baseURI.relativize( linkuri ).normalize();
146                ret = relativeURI.toASCIIString();
147                if ( ret.startsWith( baseURI.getPath() ) )
148                {
149                    ret = ret.substring( baseURI.getPath().length() );
150                }
151
152                ret = URLDecoder.decode( ret, "UTF-8" );
153            }
154            catch ( URISyntaxException e )
155            {
156            }
157            catch ( UnsupportedEncodingException e )
158            {
159            }
160
161            return ret;
162        }
163
164        private static boolean isAcceptableLink( String link )
165        {
166            if ( StringUtils.isEmpty( link ) )
167            {
168                return false;
169            }
170
171            for ( int i = 0; i < SKIPS.length; i++ )
172            {
173                if ( SKIPS[i].matcher( link ).find() )
174                {
175                    return false;
176                }
177            }
178
179            return true;
180        }
181    }
182}