001package org.apache.maven.wagon.shared.http4;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import org.apache.commons.io.IOUtils;
023import org.apache.maven.wagon.TransferFailedException;
024import org.codehaus.plexus.util.StringUtils;
025import org.jsoup.Jsoup;
026import org.jsoup.nodes.Document;
027import org.jsoup.nodes.Element;
028import org.jsoup.select.Elements;
029
030import java.io.IOException;
031import java.io.InputStream;
032import java.io.UnsupportedEncodingException;
033import java.net.URI;
034import java.net.URISyntaxException;
035import java.net.URLDecoder;
036import java.util.ArrayList;
037import java.util.HashSet;
038import java.util.List;
039import java.util.Set;
040import java.util.regex.Pattern;
041
042/**
043 * Html File List Parser.
044 */
045public class HtmlFileListParser
046{
047    // Apache Fancy Index Sort Headers
048    private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
049
050    // URLs with excessive paths.
051    private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
052
053    // URLs that to a parent directory.
054    private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
055
056    // mailto urls
057    private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
058
059    private static final Pattern[] SKIPS =
060        new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
061
062    /**
063     * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
064     *
065     * @param stream the input stream.
066     * @return the file list.
067     * @throws TransferFailedException if there was a problem fetching the raw html.
068     */
069    public static List<String> parseFileList( String baseurl, InputStream stream )
070        throws TransferFailedException
071    {
072        try
073        {
074            URI baseURI = new URI( baseurl );
075            // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
076            // assumption.
077            String content = IOUtils.toString( stream, "utf-8" );
078            Document doc = Jsoup.parse( content, baseurl );
079            Elements links = doc.select("a[href]");
080            Set<String> results = new HashSet<String>();
081            for ( int lx = 0; lx < links.size(); lx++ )
082            {
083                Element link = links.get( lx );
084                /*
085                 * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
086                 */
087                String target = link.attr( "href" );
088                if ( target != null )
089                {
090                    String clean = cleanLink( baseURI, target );
091                    if ( isAcceptableLink( clean ) )
092                    {
093                        results.add( clean );
094                    }
095                }
096
097            }
098
099            return new ArrayList<String>( results );
100        }
101        catch ( URISyntaxException e )
102        {
103            throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
104        }
105        catch ( IOException e )
106        {
107            throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
108        }
109    }
110
111    private static String cleanLink( URI baseURI, String link )
112    {
113        if ( StringUtils.isEmpty( link ) )
114        {
115            return "";
116        }
117
118        String ret = link;
119
120        try
121        {
122            URI linkuri = new URI( ret );
123            if ( link.startsWith( "/" ) )
124            {
125                linkuri = baseURI.resolve( linkuri );
126            }
127            URI relativeURI = baseURI.relativize( linkuri ).normalize();
128            ret = relativeURI.toASCIIString();
129            if ( ret.startsWith( baseURI.getPath() ) )
130            {
131                ret = ret.substring( baseURI.getPath().length() );
132            }
133
134            ret = URLDecoder.decode( ret, "UTF-8" );
135        }
136        catch ( URISyntaxException e )
137        {
138        }
139        catch ( UnsupportedEncodingException e )
140        {
141        }
142
143        return ret;
144    }
145
146    private static boolean isAcceptableLink( String link )
147    {
148        if ( StringUtils.isEmpty( link ) )
149        {
150            return false;
151        }
152
153        for ( int i = 0; i < SKIPS.length; i++ )
154        {
155            if ( SKIPS[i].matcher( link ).find() )
156            {
157                return false;
158            }
159        }
160
161        return true;
162    }
163
164}