1 package org.apache.maven.plugin.linkcheck;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 import java.io.BufferedReader;
22 import java.io.File;
23 import java.io.FileReader;
24 import java.io.IOException;
25 import java.util.Set;
26 import java.util.TreeSet;
27 import java.util.regex.Matcher;
28 import java.util.regex.Pattern;
29
/**
 * Link matcher. Reads the contents of a file and extracts the targets of the following constructs:
 * <pre>
 * &lt;a href=""....
 * &lt;link href=""....
 * &lt;img src=""....
 * &lt;script src=""....
 * </pre>
 * Matching is done with a case-insensitive regular expression; the file is not parsed as HTML.
 *
 * @author <a href="mailto:mac@apache.org">Ignacio G. Mac Dowell </a>
 */
class LinkMatcher
{

    /**
     * Regexp for link matching. Capturing group 1 holds the quoted value of the matched
     * <code>href</code> or <code>src</code> attribute.
     */
    private static final Pattern LINK_PATTERN =
        Pattern.compile( "<(?>link|a|img|script)[^>]*?(?>href|src)\\s*?=\\s*?[\\\"'](.*?)[\\\"'][^>]*?",
                         Pattern.CASE_INSENSITIVE );

    /**
     * Reads a file and returns a StringBuffer with its contents.
     * <p>
     * The file is read line by line and the lines are appended without their line terminators, so the
     * returned buffer is a single line of text. The pattern above does not match across line breaks
     * inside an attribute value, which is why the terminators are not preserved.
     *
     * TODO: Check for encoding issues (the file is decoded with the JVM's default charset)
     *
     * TODO: Better exception handling?
     *
     * @param file
     *            the file we are reading
     * @return a StringBuffer with file's contents.
     * @throws IOException
     *             if the file cannot be opened or read
     */
    private static StringBuffer fileToStringBuffer( File file ) throws IOException
    {
        final StringBuffer pageBuffer = new StringBuffer();

        // Open the reader before entering the try block: if opening fails there is nothing to close,
        // and the original exception reaches the caller instead of being replaced by a
        // NullPointerException thrown from the finally clause.
        final BufferedReader reader = new BufferedReader( new FileReader( file ) );
        try
        {
            String line;
            while ( ( line = reader.readLine() ) != null )
            {
                pageBuffer.append( line );
            }
        }
        finally
        {
            reader.close();
        }
        return pageBuffer;
    }

    /**
     * Performs the actual matching.
     * <p>
     * Every <code>href</code>/<code>src</code> value found in the file is trimmed and collected. Empty
     * values and values containing the text <code>javascript</code> (in any letter case) are skipped.
     * A new sorted set is returned on every call, so results of earlier calls are not affected by
     * later ones.
     *
     * @param file
     *            the file to check
     * @return a sorted set of <code>String</code>s with all links to check; empty if none were found
     * @throws IOException
     *             if the file cannot be opened or read
     */
    static Set match( File file ) throws IOException
    {
        final Set links = new TreeSet();
        final Matcher matcher = LINK_PATTERN.matcher( fileToStringBuffer( file ) );
        while ( matcher.find() )
        {
            final String link = matcher.group( 1 ).trim();
            if ( link.length() < 1 )
            {
                continue;
            }

            // Lower-case with a fixed locale: with the default locale (e.g. Turkish) an upper-case
            // "JAVASCRIPT" would not lower-case to "javascript" and would slip through the filter.
            if ( link.toLowerCase( java.util.Locale.ENGLISH ).indexOf( "javascript" ) != -1 )
            {
                continue;
            }

            links.add( link );
        }
        return links;
    }

}