1 package org.apache.maven.wagon.shared.http4;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import org.apache.commons.io.IOUtils;
23 import org.apache.maven.wagon.TransferFailedException;
24 import org.codehaus.plexus.util.StringUtils;
25 import org.jsoup.Jsoup;
26 import org.jsoup.nodes.Document;
27 import org.jsoup.nodes.Element;
28 import org.jsoup.select.Elements;
29
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.io.UnsupportedEncodingException;
33 import java.net.URI;
34 import java.net.URISyntaxException;
35 import java.net.URLDecoder;
36 import java.util.ArrayList;
37 import java.util.HashSet;
38 import java.util.List;
39 import java.util.Set;
40 import java.util.regex.Pattern;
41
42
43
44
45 public class HtmlFileListParser
46 {
47
48 private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
49
50
51 private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
52
53
54 private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
55
56
57 private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
58
59 private static final Pattern[] SKIPS =
60 new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
61
62
63
64
65
66
67
68
69 public static List<String> parseFileList( String baseurl, InputStream stream )
70 throws TransferFailedException
71 {
72 try
73 {
74 URI baseURI = new URI( baseurl );
75
76
77 String content = IOUtils.toString( stream, "utf-8" );
78 Document doc = Jsoup.parse( content, baseurl );
79 Elements links = doc.select("a[href]");
80 Set<String> results = new HashSet<String>();
81 for ( int lx = 0; lx < links.size(); lx++ )
82 {
83 Element link = links.get( lx );
84
85
86
87 String target = link.attr( "href" );
88 if ( target != null )
89 {
90 String clean = cleanLink( baseURI, target );
91 if ( isAcceptableLink( clean ) )
92 {
93 results.add( clean );
94 }
95 }
96
97 }
98
99 return new ArrayList<String>( results );
100 }
101 catch ( URISyntaxException e )
102 {
103 throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
104 }
105 catch ( IOException e )
106 {
107 throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
108 }
109 }
110
111 private static String cleanLink( URI baseURI, String link )
112 {
113 if ( StringUtils.isEmpty( link ) )
114 {
115 return "";
116 }
117
118 String ret = link;
119
120 try
121 {
122 URI linkuri = new URI( ret );
123 if ( link.startsWith( "/" ) )
124 {
125 linkuri = baseURI.resolve( linkuri );
126 }
127 URI relativeURI = baseURI.relativize( linkuri ).normalize();
128 ret = relativeURI.toASCIIString();
129 if ( ret.startsWith( baseURI.getPath() ) )
130 {
131 ret = ret.substring( baseURI.getPath().length() );
132 }
133
134 ret = URLDecoder.decode( ret, "UTF-8" );
135 }
136 catch ( URISyntaxException e )
137 {
138 }
139 catch ( UnsupportedEncodingException e )
140 {
141 }
142
143 return ret;
144 }
145
146 private static boolean isAcceptableLink( String link )
147 {
148 if ( StringUtils.isEmpty( link ) )
149 {
150 return false;
151 }
152
153 for ( int i = 0; i < SKIPS.length; i++ )
154 {
155 if ( SKIPS[i].matcher( link ).find() )
156 {
157 return false;
158 }
159 }
160
161 return true;
162 }
163
164 }