001package org.apache.maven.wagon.shared.http4; 002 003/* 004 * Licensed to the Apache Software Foundation (ASF) under one 005 * or more contributor license agreements. See the NOTICE file 006 * distributed with this work for additional information 007 * regarding copyright ownership. The ASF licenses this file 008 * to you under the Apache License, Version 2.0 (the 009 * "License"); you may not use this file except in compliance 010 * with the License. You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, 015 * software distributed under the License is distributed on an 016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 017 * KIND, either express or implied. See the License for the 018 * specific language governing permissions and limitations 019 * under the License. 020 */ 021 022import org.apache.commons.io.IOUtils; 023import org.apache.maven.wagon.TransferFailedException; 024import org.codehaus.plexus.util.StringUtils; 025import org.jsoup.Jsoup; 026import org.jsoup.nodes.Document; 027import org.jsoup.nodes.Element; 028import org.jsoup.select.Elements; 029 030import java.io.IOException; 031import java.io.InputStream; 032import java.io.UnsupportedEncodingException; 033import java.net.URI; 034import java.net.URISyntaxException; 035import java.net.URLDecoder; 036import java.util.ArrayList; 037import java.util.HashSet; 038import java.util.List; 039import java.util.Set; 040import java.util.regex.Pattern; 041 042/** 043 * Html File List Parser. 044 */ 045public class HtmlFileListParser 046{ 047 // Apache Fancy Index Sort Headers 048 private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" ); 049 050 // URLs with excessive paths. 051 private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" ); 052 053 // URLs that to a parent directory. 054 private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" ); 055 056 // mailto urls 057 private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" ); 058 059 private static final Pattern[] SKIPS = 060 new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS }; 061 062 /** 063 * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list. 064 * 065 * @param stream the input stream. 066 * @return the file list. 067 * @throws TransferFailedException if there was a problem fetching the raw html. 068 */ 069 public static List<String> parseFileList( String baseurl, InputStream stream ) 070 throws TransferFailedException 071 { 072 try 073 { 074 URI baseURI = new URI( baseurl ); 075 // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe 076 // assumption. 077 String content = IOUtils.toString( stream, "utf-8" ); 078 Document doc = Jsoup.parse( content, baseurl ); 079 Elements links = doc.select("a[href]"); 080 Set<String> results = new HashSet<String>(); 081 for ( int lx = 0; lx < links.size(); lx++ ) 082 { 083 Element link = links.get( lx ); 084 /* 085 * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink 086 */ 087 String target = link.attr( "href" ); 088 if ( target != null ) 089 { 090 String clean = cleanLink( baseURI, target ); 091 if ( isAcceptableLink( clean ) ) 092 { 093 results.add( clean ); 094 } 095 } 096 097 } 098 099 return new ArrayList<String>( results ); 100 } 101 catch ( URISyntaxException e ) 102 { 103 throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e ); 104 } 105 catch ( IOException e ) 106 { 107 throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e ); 108 } 109 } 110 111 private static String cleanLink( URI baseURI, String link ) 112 { 113 if ( StringUtils.isEmpty( link ) ) 114 { 115 return ""; 116 } 117 118 String ret = link; 119 120 try 121 { 122 URI linkuri = new URI( ret ); 123 if ( link.startsWith( "/" ) ) 124 { 125 linkuri = baseURI.resolve( linkuri ); 126 } 127 URI relativeURI = baseURI.relativize( linkuri ).normalize(); 128 ret = relativeURI.toASCIIString(); 129 if ( ret.startsWith( baseURI.getPath() ) ) 130 { 131 ret = ret.substring( baseURI.getPath().length() ); 132 } 133 134 ret = URLDecoder.decode( ret, "UTF-8" ); 135 } 136 catch ( URISyntaxException e ) 137 { 138 } 139 catch ( UnsupportedEncodingException e ) 140 { 141 } 142 143 return ret; 144 } 145 146 private static boolean isAcceptableLink( String link ) 147 { 148 if ( StringUtils.isEmpty( link ) ) 149 { 150 return false; 151 } 152 153 for ( int i = 0; i < SKIPS.length; i++ ) 154 { 155 if ( SKIPS[i].matcher( link ).find() ) 156 { 157 return false; 158 } 159 } 160 161 return true; 162 } 163 164}