View Javadoc
1   /*
2    * Copyright (c) 2002-2022, City of Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.util.mail;
35  
36  import fr.paris.lutece.portal.service.util.AppLogService;
37  
38  import org.w3c.dom.Document;
39  import org.w3c.dom.NamedNodeMap;
40  import org.w3c.dom.Node;
41  import org.w3c.dom.NodeList;
42  
43  import org.w3c.tidy.Tidy;
44  
45  import java.io.ByteArrayInputStream;
46  
47  import java.net.MalformedURLException;
48  import java.net.URL;
49  
50  import java.util.ArrayList;
51  import java.util.HashMap;
52  import java.util.List;
53  import java.util.Map;
54  
55  import javax.activation.DataHandler;
56  
57  /**
58   * This classes provides implementation to retrieve urls from specified tags on an HTML page.
59   */
60  public class HtmlDocument
61  {
62      // Definition of some basic html elements
63      /**
64       * To define a CSS, html element must have:
65       * <ul>
66       * <li>"link" tag name</li>
67       * <li>"rel" attribute equal to "stylesheet"</li>
68       * </ul>
69       * The url is contained in the attributed named "href"
70       */
71      public static final ElementUrl ELEMENT_CSS;
72  
73      /**
74       * To define a javascript, html element must have:
75       * <ul>
76       * <li>"script" tag name</li>
77       * <li>"type" attribute equal to "text/javascript"</li>
78       * </ul>
79       * The url is contained in the attributed named "src"
80       */
81      public static final ElementUrl ELEMENT_JAVASCRIPT;
82  
83      /**
84       * To define an image, html element must have:
85       * <ul>
86       * <li>"img" tag name</li>
87       * </ul>
88       * The url is contained in the attributed named "src"
89       */
90      public static final ElementUrl ELEMENT_IMG;
91  
92      static
93      {
94          ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );
95          ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
96          ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
97      }
98  
99      private Document _content;
100     private String _strBaseUrl;
101     private boolean _useAbsoluteUrl;
102 
103     /**
104      * Instanciates an HtmlDocument after having built the DOM tree.
105      *
106      * @param strHtml
107      *            The Html code to be parsed.
108      * @param strBaseUrl
109      *            The Base url used to retrieve urls.
110      * @param useAbsoluteUrl
111      *            Determine if we use absolute or relative url for HTML element's names
112      */
113     public HtmlDocument( String strHtml, String strBaseUrl, boolean useAbsoluteUrl )
114     {
115         // use of tidy to retrieve the DOM tree
116         Tidy tidy = new Tidy( );
117         tidy.setQuiet( true );
118         tidy.setShowWarnings( false );
119 
120         _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes( ) ), null );
121         _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
122         _useAbsoluteUrl = useAbsoluteUrl;
123     }
124 
125     /**
126      * Get the urls of all html elements specified by elementType
127      *
128      * @param elementType
129      *            the type of element to get
130      * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
131      */
132     public Map<String, URL> getAllUrls( ElementUrl elementType )
133     {
134         Map<String, URL> mapUrl = new HashMap<>( );
135 
136         NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
137 
138         for ( int i = 0; i < nodes.getLength( ); i++ )
139         {
140             Node node = nodes.item( i );
141             NamedNodeMap attributes = node.getAttributes( );
142 
143             // Test if the element matches the required attribute
144             if ( elementType.getTestedAttributeName( ) != null )
145             {
146                 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
147 
148                 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
149                 {
150                     continue;
151                 }
152             }
153 
154             // Retrieve the url, then test if it matches the base url
155             String strSrc = attributes.getNamedItem( elementType.getAttributeName( ) ).getNodeValue( );
156 
157             if ( strSrc.startsWith( _strBaseUrl ) )
158             {
159                 try
160                 {
161                     URL url = new URL( strSrc );
162                     mapUrl.put( getUrlName( url ), url );
163                 }
164                 catch( MalformedURLException e )
165                 {
166                     // ignored document
167                     AppLogService.info( " {} not found, location ignored.", strSrc );
168                 }
169             }
170         }
171 
172         return mapUrl;
173     }
174 
175     /**
176      * Get the urls of all html elements specified by elementType
177      *
178      * @param elementType
179      *            the type of element to get
180      * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
181      */
182     public List<UrlAttachment> getAllUrlsAttachement( ElementUrl elementType )
183     {
184         List<UrlAttachment> listUrlAttachement = new ArrayList<>( );
185         NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
186 
187         for ( int i = 0; i < nodes.getLength( ); i++ )
188         {
189             Node node = nodes.item( i );
190             NamedNodeMap attributes = node.getAttributes( );
191 
192             // Test if the element matches the required attribute
193             if ( elementType.getTestedAttributeName( ) != null )
194             {
195                 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
196 
197                 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
198                 {
199                     continue;
200                 }
201             }
202 
203             // Retrieve the url, then test if it matches the base url
204             String strAttributeName = elementType.getAttributeName( );
205 
206             if ( ( strAttributeName != null ) && ( attributes != null ) )
207             {
208                 Node attributeNode = attributes.getNamedItem( strAttributeName );
209                 createAttributeUrl( attributeNode, listUrlAttachement );
210             }
211         }
212 
213         return listUrlAttachement;
214     }
215 
216     private void createAttributeUrl( Node attributeNode, List<UrlAttachment> listUrlAttachement )
217     {
218         if ( attributeNode != null )
219         {
220             String strSrc = attributeNode.getNodeValue( );
221 
222             if ( ( strSrc != null ) && strSrc.startsWith( _strBaseUrl ) )
223             {
224                 try
225                 {
226                     URL url = new URL( strSrc );
227                     UrlAttachmenthtml#UrlAttachment">UrlAttachment urlAttachement = new UrlAttachment( getUrlName( url ), url );
228                     listUrlAttachement.add( urlAttachement );
229                 }
230                 catch( MalformedURLException e )
231                 {
232                     // ignored document
233                     AppLogService.info( " {} not found, location ignored.", strSrc );
234                 }
235             }
236         }
237     }
238 
239     /**
240      * Loads the url in a DataHandler
241      *
242      * @param url
243      *            an absolute url
244      * @return an Object containing the DataHandler
245      */
246     protected Object getUrlContent( URL url )
247     {
248         return new DataHandler( url );
249     }
250 
251     /**
252      * Return the absolute or relative url depending on _useAbsoluteUrl
253      * 
254      * @param url
255      *            an absolute url
256      * @return a String representing the url
257      */
258     protected String getUrlName( URL url )
259     {
260         return _useAbsoluteUrl ? url.toExternalForm( ) : url.getPath( );
261     }
262 
263     /**
264      * provide a description for the HTML elements to be parsed
265      */
266     private static class ElementUrl
267     {
268         private String _strTagName;
269         private String _strAttributeName;
270         private String _strTestedAttributeName;
271         private String _strTestedAttributeValue;
272 
273         /**
274          * Instanciates an ElementUrl
275          *
276          * @param strTagName
277          *            the tag name to get (example: link, script, img, ...)
278          * @param strAttributeName
279          *            the attribute name to get (example: src, href, ...)
280          * @param strTestedAttributeName
281          *            the attribute name to test
282          * @param strTestedAttributeValue
283          *            the value of the attribute to test : if the value of the attribute strTestedAttributeName equals strTestedAttributeValue, then we get the
284          *            element's url, else we do nothing.
285          */
286         public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName, String strTestedAttributeValue )
287         {
288             _strTagName = strTagName;
289             _strAttributeName = strAttributeName;
290             _strTestedAttributeName = strTestedAttributeName;
291             _strTestedAttributeValue = strTestedAttributeValue;
292         }
293 
294         /**
295          * Returns the attributeName
296          * 
297          * @return the attributeName
298          */
299         public String getAttributeName( )
300         {
301             return _strAttributeName;
302         }
303 
304         /**
305          * Returns the tagName
306          * 
307          * @return the tagName
308          */
309         public String getTagName( )
310         {
311             return _strTagName;
312         }
313 
314         /**
315          * Returns the testedAttributeName
316          * 
317          * @return the testedAttributeName
318          */
319         public String getTestedAttributeName( )
320         {
321             return _strTestedAttributeName;
322         }
323 
324         /**
325          * Returns the testedAttributeValue
326          * 
327          * @return the testedAttributeValue
328          */
329         public String getTestedAttributeValue( )
330         {
331             return _strTestedAttributeValue;
332         }
333     }
334 }