package org.apache.maven.wagon.shared.http;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.maven.wagon.TransferFailedException;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParserConfiguration;
import org.codehaus.plexus.util.StringUtils;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.filters.DefaultFilter;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Html File List Parser.
 * <p>
 * Extracts the targets of the anchor (<code>&lt;a href="..."&gt;</code>) elements of an HTML
 * directory listing and reports them relative to the URL of the listing.
 */
public class HtmlFileListParser
{
    /**
     * Parses the raw HTML read from the provided InputStream and returns the file list.
     * <p>
     * The HTML is read as UTF-8. Every anchor <code>href</code> is made relative to
     * <code>baseurl</code>; sort links of Apache fancy indexes, links containing more than one
     * path separator, links to a parent directory and <code>mailto:</code> links are dropped.
     *
     * @param baseurl the URL the HTML was fetched from; it must be parseable as a {@link URI}.
     * @param stream  the stream to read the raw HTML from.
     * @return the file list, without duplicates and in no particular order.
     * @throws TransferFailedException if <code>baseurl</code> is not a valid URI or if there was a
     *                                 problem reading the raw html.
     */
    public static List<String> parseFileList( String baseurl, InputStream stream )
        throws TransferFailedException
    {
        try
        {
            // Use URI object to get benefits of proper absolute and relative path resolution for free
            URI baseURI = new URI( baseurl );

            Parser handler = new Parser( baseURI );

            XMLParserConfiguration parser = new HTMLConfiguration();
            parser.setDocumentHandler( handler );
            parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
            // Report element and attribute names in upper case so that the handler can compare
            // them against the constants "A" and "HREF" regardless of how the page spells them.
            parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
            parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
            parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );

            return new ArrayList<String>( handler.getLinks() );
        }
        catch ( URISyntaxException e )
        {
            throw new TransferFailedException( "Unable to parse as URI: " + baseurl, e );
        }
        catch ( IOException e )
        {
            throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
        }
    }

    /**
     * Document handler that collects the acceptable anchor targets of the parsed page.
     */
    private static class Parser
        extends DefaultFilter
    {
        // Apache Fancy Index Sort Headers
        private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );

        // URLs with excessive paths (two slashes with anything but a slash in between).
        private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );

        // URLs that point to a parent directory.
        private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );

        // mailto urls
        private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );

        // A link is rejected as soon as any of these patterns is found anywhere in it.
        private static final Pattern[] SKIPS =
            new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };

        private final Set<String> links = new HashSet<String>();

        private final URI baseURI;

        /**
         * @param baseURI the URI of the page being parsed; it is stored in normalized form.
         */
        public Parser( URI baseURI )
        {
            this.baseURI = baseURI.normalize();
        }

        /**
         * @return the links collected so far.
         */
        public Set<String> getLinks()
        {
            return links;
        }

        /**
         * Records the cleaned <code>HREF</code> of every <code>A</code> element that passes
         * {@link #isAcceptableLink(String)}. The event is not forwarded to any other handler.
         */
        @Override
        public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
        {
            if ( !"A".equals( element.rawname ) )
            {
                return;
            }

            String href = attrs.getValue( "HREF" );
            if ( href == null )
            {
                return;
            }

            String link = cleanLink( baseURI, href );
            if ( isAcceptableLink( link ) )
            {
                links.add( link );
            }
        }

        /**
         * Makes a link relative to the base URI and decodes its percent-escapes.
         *
         * @param baseURI the normalized URI of the page the link was found in.
         * @param link    the raw <code>href</code> value.
         * @return the cleaned link, an empty string for an empty link, or the link unchanged if
         *         it cannot be parsed as a URI.
         */
        private static String cleanLink( URI baseURI, String link )
        {
            if ( StringUtils.isEmpty( link ) )
            {
                return "";
            }

            String ret = link;

            try
            {
                URI linkuri = new URI( ret );
                URI relativeURI = baseURI.relativize( linkuri ).normalize();
                ret = relativeURI.toASCIIString();

                // relativize() leaves links without scheme/authority untouched, so strip the base
                // path by hand. getPath() is null for an opaque base URI: guard against the NPE.
                String basePath = baseURI.getPath();
                if ( basePath != null && ret.startsWith( basePath ) )
                {
                    ret = ret.substring( basePath.length() );
                }

                // URLDecoder implements form decoding, where '+' stands for a space. In a URI
                // path '+' is a literal character, so protect it before decoding.
                ret = URLDecoder.decode( ret.replace( "+", "%2B" ), "UTF-8" );
            }
            catch ( URISyntaxException ignored )
            {
                // Best effort: a link that is not a valid URI is handed on unchanged and is left
                // to isAcceptableLink() to accept or reject.
            }
            catch ( UnsupportedEncodingException ignored )
            {
                // Cannot happen: every Java platform is required to support UTF-8.
            }

            return ret;
        }

        /**
         * @param link the cleaned link.
         * @return <code>true</code> if the link is not empty and matches none of the skip patterns.
         */
        private static boolean isAcceptableLink( String link )
        {
            if ( StringUtils.isEmpty( link ) )
            {
                return false;
            }

            for ( Pattern skip : SKIPS )
            {
                if ( skip.matcher( link ).find() )
                {
                    return false;
                }
            }

            return true;
        }
    }
}