1 /*
2 * Copyright (c) 2002-2025, City of Paris
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice
10 * and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice
13 * and the following disclaimer in the documentation and/or other materials
14 * provided with the distribution.
15 *
16 * 3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 *
32 * License 1.0
33 */
34 package fr.paris.lutece.util.mail;
35
36 import fr.paris.lutece.portal.service.util.AppLogService;
37
38 import org.w3c.dom.Document;
39 import org.w3c.dom.NamedNodeMap;
40 import org.w3c.dom.Node;
41 import org.w3c.dom.NodeList;
42
43 import org.w3c.tidy.Tidy;
44
45 import java.io.ByteArrayInputStream;
46
47 import java.net.MalformedURLException;
48 import java.net.URL;
49
50 import java.util.ArrayList;
51 import java.util.HashMap;
52 import java.util.List;
53 import java.util.Map;
54
55 import javax.activation.DataHandler;
56
57 /**
58 * This classes provides implementation to retrieve urls from specified tags on an HTML page.
59 */
60 public class HtmlDocument
61 {
62 // Definition of some basic html elements
63 /**
64 * To define a CSS, html element must have:
65 * <ul>
66 * <li>"link" tag name</li>
67 * <li>"rel" attribute equal to "stylesheet"</li>
68 * </ul>
69 * The url is contained in the attributed named "href"
70 */
71 public static final ElementUrl ELEMENT_CSS;
72
73 /**
74 * To define a javascript, html element must have:
75 * <ul>
76 * <li>"script" tag name</li>
77 * <li>"type" attribute equal to "text/javascript"</li>
78 * </ul>
79 * The url is contained in the attributed named "src"
80 */
81 public static final ElementUrl ELEMENT_JAVASCRIPT;
82
83 /**
84 * To define an image, html element must have:
85 * <ul>
86 * <li>"img" tag name</li>
87 * </ul>
88 * The url is contained in the attributed named "src"
89 */
90 public static final ElementUrl ELEMENT_IMG;
91
92 static
93 {
94 ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );
95 ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
96 ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
97 }
98
99 private Document _content;
100 private String _strBaseUrl;
101 private boolean _useAbsoluteUrl;
102
103 /**
104 * Instanciates an HtmlDocument after having built the DOM tree.
105 *
106 * @param strHtml
107 * The Html code to be parsed.
108 * @param strBaseUrl
109 * The Base url used to retrieve urls.
110 * @param useAbsoluteUrl
111 * Determine if we use absolute or relative url for HTML element's names
112 */
113 public HtmlDocument( String strHtml, String strBaseUrl, boolean useAbsoluteUrl )
114 {
115 // use of tidy to retrieve the DOM tree
116 Tidy tidy = new Tidy( );
117 tidy.setQuiet( true );
118 tidy.setShowWarnings( false );
119
120 _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes( ) ), null );
121 _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
122 _useAbsoluteUrl = useAbsoluteUrl;
123 }
124
125 /**
126 * Get the urls of all html elements specified by elementType
127 *
128 * @param elementType
129 * the type of element to get
130 * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
131 */
132 public Map<String, URL> getAllUrls( ElementUrl elementType )
133 {
134 Map<String, URL> mapUrl = new HashMap<>( );
135
136 NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
137
138 for ( int i = 0; i < nodes.getLength( ); i++ )
139 {
140 Node node = nodes.item( i );
141 NamedNodeMap attributes = node.getAttributes( );
142
143 // Test if the element matches the required attribute
144 if ( elementType.getTestedAttributeName( ) != null )
145 {
146 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
147
148 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
149 {
150 continue;
151 }
152 }
153
154 // Retrieve the url, then test if it matches the base url
155 String strSrc = attributes.getNamedItem( elementType.getAttributeName( ) ).getNodeValue( );
156
157 if ( strSrc.startsWith( _strBaseUrl ) )
158 {
159 try
160 {
161 URL url = new URL( strSrc );
162 mapUrl.put( getUrlName( url ), url );
163 }
164 catch( MalformedURLException e )
165 {
166 // ignored document
167 AppLogService.info( " {} not found, location ignored.", strSrc );
168 }
169 }
170 }
171
172 return mapUrl;
173 }
174
175 /**
176 * Get the urls of all html elements specified by elementType
177 *
178 * @param elementType
179 * the type of element to get
180 * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
181 */
182 public List<UrlAttachment> getAllUrlsAttachement( ElementUrl elementType )
183 {
184 List<UrlAttachment> listUrlAttachement = new ArrayList<>( );
185 NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
186
187 for ( int i = 0; i < nodes.getLength( ); i++ )
188 {
189 Node node = nodes.item( i );
190 NamedNodeMap attributes = node.getAttributes( );
191
192 // Test if the element matches the required attribute
193 if ( elementType.getTestedAttributeName( ) != null )
194 {
195 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
196
197 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
198 {
199 continue;
200 }
201 }
202
203 // Retrieve the url, then test if it matches the base url
204 String strAttributeName = elementType.getAttributeName( );
205
206 if ( ( strAttributeName != null ) && ( attributes != null ) )
207 {
208 Node attributeNode = attributes.getNamedItem( strAttributeName );
209 createAttributeUrl( attributeNode, listUrlAttachement );
210 }
211 }
212
213 return listUrlAttachement;
214 }
215
216 private void createAttributeUrl( Node attributeNode, List<UrlAttachment> listUrlAttachement )
217 {
218 if ( attributeNode != null )
219 {
220 String strSrc = attributeNode.getNodeValue( );
221
222 if ( ( strSrc != null ) && strSrc.startsWith( _strBaseUrl ) )
223 {
224 try
225 {
226 URL url = new URL( strSrc );
227 UrlAttachmenthtml#UrlAttachment">UrlAttachment urlAttachement = new UrlAttachment( getUrlName( url ), url );
228 listUrlAttachement.add( urlAttachement );
229 }
230 catch( MalformedURLException e )
231 {
232 // ignored document
233 AppLogService.info( " {} not found, location ignored.", strSrc );
234 }
235 }
236 }
237 }
238
239 /**
240 * Loads the url in a DataHandler
241 *
242 * @param url
243 * an absolute url
244 * @return an Object containing the DataHandler
245 */
246 protected Object getUrlContent( URL url )
247 {
248 return new DataHandler( url );
249 }
250
251 /**
252 * Return the absolute or relative url depending on _useAbsoluteUrl
253 *
254 * @param url
255 * an absolute url
256 * @return a String representing the url
257 */
258 protected String getUrlName( URL url )
259 {
260 return _useAbsoluteUrl ? url.toExternalForm( ) : url.getPath( );
261 }
262
263 /**
264 * provide a description for the HTML elements to be parsed
265 */
266 private static class ElementUrl
267 {
268 private String _strTagName;
269 private String _strAttributeName;
270 private String _strTestedAttributeName;
271 private String _strTestedAttributeValue;
272
273 /**
274 * Instanciates an ElementUrl
275 *
276 * @param strTagName
277 * the tag name to get (example: link, script, img, ...)
278 * @param strAttributeName
279 * the attribute name to get (example: src, href, ...)
280 * @param strTestedAttributeName
281 * the attribute name to test
282 * @param strTestedAttributeValue
283 * the value of the attribute to test : if the value of the attribute strTestedAttributeName equals strTestedAttributeValue, then we get the
284 * element's url, else we do nothing.
285 */
286 public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName, String strTestedAttributeValue )
287 {
288 _strTagName = strTagName;
289 _strAttributeName = strAttributeName;
290 _strTestedAttributeName = strTestedAttributeName;
291 _strTestedAttributeValue = strTestedAttributeValue;
292 }
293
294 /**
295 * Returns the attributeName
296 *
297 * @return the attributeName
298 */
299 public String getAttributeName( )
300 {
301 return _strAttributeName;
302 }
303
304 /**
305 * Returns the tagName
306 *
307 * @return the tagName
308 */
309 public String getTagName( )
310 {
311 return _strTagName;
312 }
313
314 /**
315 * Returns the testedAttributeName
316 *
317 * @return the testedAttributeName
318 */
319 public String getTestedAttributeName( )
320 {
321 return _strTestedAttributeName;
322 }
323
324 /**
325 * Returns the testedAttributeValue
326 *
327 * @return the testedAttributeValue
328 */
329 public String getTestedAttributeValue( )
330 {
331 return _strTestedAttributeValue;
332 }
333 }
334 }