1 package org.apache.maven.wagon.shared.http;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import org.apache.commons.io.IOUtils;
23 import org.apache.commons.lang.StringUtils;
24 import org.apache.maven.wagon.TransferFailedException;
25 import org.jsoup.Jsoup;
26 import org.jsoup.nodes.Document;
27 import org.jsoup.nodes.Element;
28 import org.jsoup.select.Elements;
29
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.io.UnsupportedEncodingException;
33 import java.net.URI;
34 import java.net.URISyntaxException;
35 import java.net.URLDecoder;
36 import java.util.ArrayList;
37 import java.util.HashSet;
38 import java.util.List;
39 import java.util.Set;
40 import java.util.regex.Pattern;
41
42
43
44
45 public class HtmlFileListParser
46 {
47
48 private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
49
50
51 private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
52
53
54 private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
55
56
57 private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
58
59 private static final Pattern[] SKIPS =
60 new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
61
62
63
64
65
66
67
68
69 public static List<String> parseFileList( String baseurl, InputStream stream )
70 throws TransferFailedException
71 {
72 try
73 {
74 URI baseURI = new URI( baseurl );
75
76
77 String content = IOUtils.toString( stream, "utf-8" );
78 Document doc = Jsoup.parse( content, baseurl );
79 Elements links = doc.select("a[href]");
80 Set<String> results = new HashSet<String>();
81 for ( Element link : links )
82 {
83
84
85
86 String target = link.attr( "href" );
87 if ( target != null )
88 {
89 String clean = cleanLink( baseURI, target );
90 if ( isAcceptableLink( clean ) )
91 {
92 results.add( clean );
93 }
94 }
95
96 }
97
98 return new ArrayList<String>( results );
99 }
100 catch ( URISyntaxException e )
101 {
102 throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
103 }
104 catch ( IOException e )
105 {
106 throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
107 }
108 }
109
110 private static String cleanLink( URI baseURI, String link )
111 {
112 if ( StringUtils.isEmpty( link ) )
113 {
114 return "";
115 }
116
117 String ret = link;
118
119 try
120 {
121 URI linkuri = new URI( ret );
122 if ( link.startsWith( "/" ) )
123 {
124 linkuri = baseURI.resolve( linkuri );
125 }
126 URI relativeURI = baseURI.relativize( linkuri ).normalize();
127 ret = relativeURI.toASCIIString();
128 if ( ret.startsWith( baseURI.getPath() ) )
129 {
130 ret = ret.substring( baseURI.getPath().length() );
131 }
132
133 ret = URLDecoder.decode( ret, "UTF-8" );
134 }
135 catch ( URISyntaxException e )
136 {
137 }
138 catch ( UnsupportedEncodingException e )
139 {
140 }
141
142 return ret;
143 }
144
145 private static boolean isAcceptableLink( String link )
146 {
147 if ( StringUtils.isEmpty( link ) )
148 {
149 return false;
150 }
151
152 for ( Pattern pattern : SKIPS )
153 {
154 if ( pattern.matcher( link ).find() )
155 {
156 return false;
157 }
158 }
159
160 return true;
161 }
162
163 }