1 /*
2 * Copyright (c) 2002-2014, Mairie de Paris
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice
10 * and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice
13 * and the following disclaimer in the documentation and/or other materials
14 * provided with the distribution.
15 *
16 * 3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 *
32 * License 1.0
33 */
34 package fr.paris.lutece.util.mail;
35
36 import fr.paris.lutece.portal.service.util.AppLogService;
37
38 import org.w3c.dom.Document;
39 import org.w3c.dom.NamedNodeMap;
40 import org.w3c.dom.Node;
41 import org.w3c.dom.NodeList;
42
43 import org.w3c.tidy.Tidy;
44
45 import java.io.ByteArrayInputStream;
46
47 import java.net.MalformedURLException;
48 import java.net.URL;
49
50 import java.util.ArrayList;
51 import java.util.HashMap;
52 import java.util.List;
53 import java.util.Map;
54
55 import javax.activation.DataHandler;
56
57
58 /**
59 * This classes provides implementation to retrieve urls from specified tags
60 * on an HTML page.
61 */
62 public class HtmlDocument
63 {
64 // Definition of some basic html elements
65 /**
66 * To define a CSS, html element must have:
67 * <ul>
68 * <li>"link" tag name</li>
69 * <li>"rel" attribute equal to "stylesheet"</li>
70 * </ul>
71 * The url is contained in the attributed named "href"
72 */
73 public static final ElementUrl ELEMENT_CSS;
74
75 /**
76 * To define a javascript, html element must have:
77 * <ul>
78 * <li>"script" tag name</li>
79 * <li>"type" attribute equal to "text/javascript"</li>
80 * </ul>
81 * The url is contained in the attributed named "src"
82 */
83 public static final ElementUrl ELEMENT_JAVASCRIPT;
84
85 /**
86 * To define an image, html element must have:
87 * <ul>
88 * <li>"img" tag name</li>
89 * </ul>
90 * The url is contained in the attributed named "src"
91 */
92 public static final ElementUrl ELEMENT_IMG;
93
94 static
95 {
96 ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );
97 ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
98 ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
99 }
100
101 private Document _content;
102 private String _strBaseUrl;
103 private boolean _useAbsoluteUrl;
104
105 /**
106 * Instanciates an HtmlDocument after having built the DOM tree.
107 *
108 * @param strHtml The Html code to be parsed.
109 * @param strBaseUrl The Base url used to retrieve urls.
110 * @param useAbsoluteUrl Determine if we use absolute or relative url for HTML element's names
111 */
112 public HtmlDocument( String strHtml, String strBaseUrl, boolean useAbsoluteUrl )
113 {
114 // use of tidy to retrieve the DOM tree
115 Tidy tidy = new Tidy( );
116 tidy.setQuiet( true );
117 tidy.setShowWarnings( false );
118
119 _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes( ) ), null );
120 _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
121 _useAbsoluteUrl = useAbsoluteUrl;
122 }
123
124 /**
125 * Get the urls of all html elements specified by elementType
126 *
127 * @param elementType the type of element to get
128 * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
129 */
130 public Map<String, URL> getAllUrls( ElementUrl elementType )
131 {
132 Map<String, URL> mapUrl = new HashMap<String, URL>( );
133
134 NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
135
136 for ( int i = 0; i < nodes.getLength( ); i++ )
137 {
138 Node node = nodes.item( i );
139 NamedNodeMap attributes = node.getAttributes( );
140
141 // Test if the element matches the required attribute
142 if ( elementType.getTestedAttributeName( ) != null )
143 {
144 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
145
146 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
147 {
148 continue;
149 }
150 }
151
152 // Retrieve the url, then test if it matches the base url
153 String strSrc = attributes.getNamedItem( elementType.getAttributeName( ) ).getNodeValue( );
154
155 if ( strSrc.startsWith( _strBaseUrl ) )
156 {
157 try
158 {
159 URL url = new URL( strSrc );
160 mapUrl.put( getUrlName( url ), url );
161 }
162 catch ( MalformedURLException e )
163 {
164 // ignored document
165 AppLogService.info( strSrc + " not found, location ignored." );
166 }
167 }
168 }
169
170 return mapUrl;
171 }
172
173 /**
174 * Get the urls of all html elements specified by elementType
175 *
176 * @param elementType the type of element to get
177 * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
178 */
179 public List<UrlAttachment> getAllUrlsAttachement( ElementUrl elementType )
180 {
181 List<UrlAttachment> listUrlAttachement = new ArrayList<UrlAttachment>( );
182 UrlAttachment urlAttachement;
183 NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
184
185 for ( int i = 0; i < nodes.getLength( ); i++ )
186 {
187 Node node = nodes.item( i );
188 NamedNodeMap attributes = node.getAttributes( );
189
190 // Test if the element matches the required attribute
191 if ( elementType.getTestedAttributeName( ) != null )
192 {
193 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
194
195 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
196 {
197 continue;
198 }
199 }
200
201 // Retrieve the url, then test if it matches the base url
202 String strAttributeName = elementType.getAttributeName( );
203
204 if ( ( strAttributeName != null ) && ( attributes != null ) )
205 {
206 Node attributeNode = attributes.getNamedItem( strAttributeName );
207
208 if ( attributeNode != null )
209 {
210 String strSrc = attributeNode.getNodeValue( );
211
212 if ( ( strSrc != null ) && strSrc.startsWith( _strBaseUrl ) )
213 {
214 try
215 {
216 URL url = new URL( strSrc );
217 urlAttachement = new UrlAttachment( getUrlName( url ), url );
218 listUrlAttachement.add( urlAttachement );
219 }
220 catch ( MalformedURLException e )
221 {
222 // ignored document
223 AppLogService.info( strSrc + " not found, location ignored." );
224 }
225 }
226 }
227 }
228 }
229
230 return listUrlAttachement;
231 }
232
233 /**
234 * Loads the url in a DataHandler
235 *
236 * @param url an absolute url
237 * @return an Object containing the DataHandler
238 */
239 protected Object getUrlContent( URL url )
240 {
241 return new DataHandler( url );
242 }
243
244 /**
245 * Return the absolute or relative url depending on _useAbsoluteUrl
246 * @param url an absolute url
247 * @return a String representing the url
248 */
249 protected String getUrlName( URL url )
250 {
251 return _useAbsoluteUrl ? url.toExternalForm( ) : url.getPath( );
252 }
253
254 /**
255 * provide a description for the HTML elements to be parsed
256 */
257 private static class ElementUrl
258 {
259 private String _strTagName;
260 private String _strAttributeName;
261 private String _strTestedAttributeName;
262 private String _strTestedAttributeValue;
263
264 /**
265 * Instanciates an ElementUrl
266 *
267 * @param strTagName the tag name to get (example: link, script, img, ...)
268 * @param strAttributeName the attribute name to get (example: src, href, ...)
269 * @param strTestedAttributeName the attribute name to test
270 * @param strTestedAttributeValue the value of the attribute to test :
271 * if the value of the attribute strTestedAttributeName equals strTestedAttributeValue,
272 * then we get the element's url, else we do nothing.
273 */
274 public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName,
275 String strTestedAttributeValue )
276 {
277 _strTagName = strTagName;
278 _strAttributeName = strAttributeName;
279 _strTestedAttributeName = strTestedAttributeName;
280 _strTestedAttributeValue = strTestedAttributeValue;
281 }
282
283 /**
284 * Returns the attributeName
285 * @return the attributeName
286 */
287 public String getAttributeName( )
288 {
289 return _strAttributeName;
290 }
291
292 /**
293 * Returns the tagName
294 * @return the tagName
295 */
296 public String getTagName( )
297 {
298 return _strTagName;
299 }
300
301 /**
302 * Returns the testedAttributeName
303 * @return the testedAttributeName
304 */
305 public String getTestedAttributeName( )
306 {
307 return _strTestedAttributeName;
308 }
309
310 /**
311 * Returns the testedAttributeValue
312 * @return the testedAttributeValue
313 */
314 public String getTestedAttributeValue( )
315 {
316 return _strTestedAttributeValue;
317 }
318 }
319 }