View Javadoc

1   package org.apache.maven.plugin.linkcheck;
2   
3   /* ====================================================================
4    *   Licensed to the Apache Software Foundation (ASF) under one or more
5    *   contributor license agreements.  See the NOTICE file distributed with
6    *   this work for additional information regarding copyright ownership.
7    *   The ASF licenses this file to You under the Apache License, Version 2.0
8    *   (the "License"); you may not use this file except in compliance with
9    *   the License.  You may obtain a copy of the License at
10   *
11   *       http://www.apache.org/licenses/LICENSE-2.0
12   *
13   *   Unless required by applicable law or agreed to in writing, software
14   *   distributed under the License is distributed on an "AS IS" BASIS,
15   *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   *   See the License for the specific language governing permissions and
17   *   limitations under the License.
18   * ====================================================================
19   */
20  
21  import java.io.BufferedReader;
22  import java.io.File;
23  import java.io.FileReader;
24  import java.io.IOException;
25  import java.util.Set;
26  import java.util.TreeSet;
27  import java.util.regex.Matcher;
28  import java.util.regex.Pattern;
29  
/**
 * Link matcher. Reads the contents of a file and extracts the targets of the following constructs:
 * <pre>
 * &lt;a href=""....
 * &lt;link href=""....
 * &lt;img src=""....
 * &lt;script src=""....
 * </pre>
 * All members are static and no state is kept between calls.
 *
 * @author <a href="mailto:mac@apache.org">Ignacio G. Mac Dowell </a>
 */
class LinkMatcher
{

    /**
     * Regexp for link matching. Capturing group 1 holds the quoted value of the first <code>href</code> or
     * <code>src</code> attribute found in a tag whose name starts with <code>link</code>, <code>a</code>,
     * <code>img</code> or <code>script</code>. Matching is case insensitive.
     */
    private final static Pattern p =
        Pattern.compile( "<(?>link|a|img|script)[^>]*?(?>href|src)\\s*?=\\s*?[\\\"'](.*?)[\\\"'][^>]*?",
                         Pattern.CASE_INSENSITIVE );

    /**
     * Reads a file and returns a StringBuffer with its contents.
     * <p>
     * The file is read line by line and the lines are appended back to back: line terminators are not
     * preserved in the returned buffer.
     * </p>
     *
     * TODO: Check for encoding issues (the file is decoded with the default charset of <code>FileReader</code>)
     *
     * @param file
     *            the file we are reading
     * @return a StringBuffer with file's contents.
     * @throws IOException
     *             if the file cannot be opened or read
     */
    private static StringBuffer fileToStringBuffer( File file ) throws IOException
    {
        final StringBuffer pageBuffer = new StringBuffer();

        // Open the reader before entering the try block: if opening fails there is nothing to close, and the
        // original exception reaches the caller instead of being replaced by a NullPointerException thrown
        // from the finally clause.
        final BufferedReader reader = new BufferedReader( new FileReader( file ) );
        try
        {
            String line;
            while ( ( line = reader.readLine() ) != null )
            {
                pageBuffer.append( line );
            }
        }
        finally
        {
            reader.close();
        }
        return pageBuffer;
    }

    /**
     * Performs the actual matching.
     * <p>
     * Every call returns a new set, so a result stays valid after later calls to this method. Empty links and
     * links containing the text <code>javascript</code> (in any case) are left out; <code>mailto:</code> links
     * are kept.
     * </p>
     *
     * @param file
     *            the file to check
     * @return a new sorted set with all links to check, each one a trimmed <code>String</code>
     * @throws IOException
     *             if the file cannot be opened or read
     */
    static Set match( File file ) throws IOException
    {
        // A fresh set per call: handing out one shared static set would let the next call wipe and refill
        // a result that an earlier caller is still holding.
        final Set links = new TreeSet();
        final Matcher m = p.matcher( fileToStringBuffer( file ) );
        while ( m.find() )
        {
            final String link = m.group( 1 ).trim();
            if ( link.length() < 1 )
            {
                // empty href/src, nothing to check
                continue;
            }
            if ( link.toLowerCase().indexOf( "javascript" ) != -1 )
            {
                // script pseudo-links are not checkable
                continue;
            }
            links.add( link );
        }
        return links;
    }

}
118 
119 }