1 package org.apache.maven.wagon.shared.http;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import org.apache.commons.io.IOUtils;
23 import org.apache.maven.wagon.TransferFailedException;
24 import org.jsoup.Jsoup;
25 import org.jsoup.nodes.Document;
26 import org.jsoup.nodes.Element;
27 import org.jsoup.select.Elements;
28
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.io.UnsupportedEncodingException;
32 import java.net.URI;
33 import java.net.URISyntaxException;
34 import java.net.URLDecoder;
35 import java.util.ArrayList;
36 import java.util.HashSet;
37 import java.util.List;
38 import java.util.Set;
39 import java.util.regex.Pattern;
40
41
42
43
44 public class HtmlFileListParser
45 {
46
47 private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
48
49
50 private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
51
52
53 private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
54
55
56 private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
57
58 private static final Pattern[] SKIPS =
59 new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
60
61
62
63
64
65
66
67
68 public static List<String> parseFileList( String baseurl, InputStream stream )
69 throws TransferFailedException
70 {
71 try
72 {
73 URI baseURI = new URI( baseurl );
74
75
76 String content = IOUtils.toString( stream, "utf-8" );
77 Document doc = Jsoup.parse( content, baseurl );
78 Elements links = doc.select( "a[href]" );
79 Set<String> results = new HashSet<String>();
80 for ( Element link : links )
81 {
82
83
84
85 String target = link.attr( "href" );
86 if ( target != null )
87 {
88 String clean = cleanLink( baseURI, target );
89 if ( isAcceptableLink( clean ) )
90 {
91 results.add( clean );
92 }
93 }
94
95 }
96
97 return new ArrayList<String>( results );
98 }
99 catch ( URISyntaxException e )
100 {
101 throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e );
102 }
103 catch ( IOException e )
104 {
105 throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
106 }
107 }
108
109 private static String cleanLink( URI baseURI, String link )
110 {
111 if ( link == null || link.length() == 0 )
112 {
113 return "";
114 }
115
116 String ret = link;
117
118 try
119 {
120 URI linkuri = new URI( ret );
121 if ( link.startsWith( "/" ) )
122 {
123 linkuri = baseURI.resolve( linkuri );
124 }
125 URI relativeURI = baseURI.relativize( linkuri ).normalize();
126 ret = relativeURI.toASCIIString();
127 if ( ret.startsWith( baseURI.getPath() ) )
128 {
129 ret = ret.substring( baseURI.getPath().length() );
130 }
131
132 ret = URLDecoder.decode( ret, "UTF-8" );
133 }
134 catch ( URISyntaxException e )
135 {
136
137 }
138 catch ( UnsupportedEncodingException e )
139 {
140
141 }
142
143 return ret;
144 }
145
146 private static boolean isAcceptableLink( String link )
147 {
148 if ( link == null || link.length() == 0 )
149 {
150 return false;
151 }
152
153 for ( Pattern pattern : SKIPS )
154 {
155 if ( pattern.matcher( link ).find() )
156 {
157 return false;
158 }
159 }
160
161 return true;
162 }
163
164 }