View Javadoc
1   /*
2    * Copyright (c) 2002-2014, Mairie de Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.util.mail;
35  
36  import fr.paris.lutece.portal.service.util.AppLogService;
37  
38  import org.w3c.dom.Document;
39  import org.w3c.dom.NamedNodeMap;
40  import org.w3c.dom.Node;
41  import org.w3c.dom.NodeList;
42  
43  import org.w3c.tidy.Tidy;
44  
45  import java.io.ByteArrayInputStream;
46  
47  import java.net.MalformedURLException;
48  import java.net.URL;
49  
50  import java.util.ArrayList;
51  import java.util.HashMap;
52  import java.util.List;
53  import java.util.Map;
54  
55  import javax.activation.DataHandler;
56  
57  
58  /**
59   * This classes provides implementation to retrieve urls from specified tags
60   * on an HTML page.
61   */
62  public class HtmlDocument
63  {
64      // Definition of some basic html elements
65      /**
66       *  To define a CSS, html element must have:
67       *  <ul>
68       *  <li>"link" tag name</li>
69       *  <li>"rel" attribute equal to "stylesheet"</li>
70       *  </ul>
71       *  The url is contained in the attributed named "href"
72       */
73      public static final ElementUrl ELEMENT_CSS;
74  
75      /**
76       *  To define a javascript, html element must have:
77       *  <ul>
78       *  <li>"script" tag name</li>
79       *  <li>"type" attribute equal to "text/javascript"</li>
80       *  </ul>
81       *  The url is contained in the attributed named "src"
82       */
83      public static final ElementUrl ELEMENT_JAVASCRIPT;
84  
85      /**
86           *  To define an image, html element must have:
87           *  <ul>
88           *  <li>"img" tag name</li>
89           *  </ul>
90           *  The url is contained in the attributed named "src"
91           */
92      public static final ElementUrl ELEMENT_IMG;
93  
94      static
95      {
96          ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );
97          ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
98          ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
99      }
100 
101     private Document _content;
102     private String _strBaseUrl;
103     private boolean _useAbsoluteUrl;
104 
105     /**
106      * Instanciates an HtmlDocument after having built the DOM tree.
107      *
108      * @param strHtml The Html code to be parsed.
109      * @param strBaseUrl The Base url used to retrieve urls.
110      * @param useAbsoluteUrl Determine if we use absolute or relative url for HTML element's names
111      */
112     public HtmlDocument( String strHtml, String strBaseUrl, boolean useAbsoluteUrl )
113     {
114         // use of tidy to retrieve the DOM tree
115         Tidy tidy = new Tidy(  );
116         tidy.setQuiet( true );
117         tidy.setShowWarnings( false );
118 
119         _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes(  ) ), null );
120         _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
121         _useAbsoluteUrl = useAbsoluteUrl;
122     }
123 
124     /**
125      * Get the urls of all html elements specified by elementType
126      *
127      * @param elementType the type of element to get
128      * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
129      */
130     public Map<String, URL> getAllUrls( ElementUrl elementType )
131     {
132         Map<String, URL> mapUrl = new HashMap<String, URL>(  );
133 
134         NodeList nodes = _content.getElementsByTagName( elementType.getTagName(  ) );
135 
136         for ( int i = 0; i < nodes.getLength(  ); i++ )
137         {
138             Node node = nodes.item( i );
139             NamedNodeMap attributes = node.getAttributes(  );
140 
141             // Test if the element matches the required attribute
142             if ( elementType.getTestedAttributeName(  ) != null )
143             {
144                 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName(  ) ).getNodeValue(  );
145 
146                 if ( !elementType.getTestedAttributeValue(  ).equals( strRel ) )
147                 {
148                     continue;
149                 }
150             }
151 
152             // Retrieve the url, then test if it matches the base url
153             String strSrc = attributes.getNamedItem( elementType.getAttributeName(  ) ).getNodeValue(  );
154 
155             if ( strSrc.startsWith( _strBaseUrl ) )
156             {
157                 try
158                 {
159                     URL url = new URL( strSrc );
160                     mapUrl.put( getUrlName( url ), url );
161                 }
162                 catch ( MalformedURLException e )
163                 {
164                     // ignored document
165                     AppLogService.info( strSrc + " not found, location ignored." );
166                 }
167             }
168         }
169 
170         return mapUrl;
171     }
172 
173     /**
174      * Get the urls of all html elements specified by elementType
175      *
176      * @param elementType the type of element to get
177      * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
178      */
179     public List<UrlAttachment> getAllUrlsAttachement( ElementUrl elementType )
180     {
181         List<UrlAttachment> listUrlAttachement = new ArrayList<UrlAttachment>(  );
182         UrlAttachment urlAttachement;
183         NodeList nodes = _content.getElementsByTagName( elementType.getTagName(  ) );
184 
185         for ( int i = 0; i < nodes.getLength(  ); i++ )
186         {
187             Node node = nodes.item( i );
188             NamedNodeMap attributes = node.getAttributes(  );
189 
190             // Test if the element matches the required attribute
191             if ( elementType.getTestedAttributeName(  ) != null )
192             {
193                 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName(  ) ).getNodeValue(  );
194 
195                 if ( !elementType.getTestedAttributeValue(  ).equals( strRel ) )
196                 {
197                     continue;
198                 }
199             }
200 
201             // Retrieve the url, then test if it matches the base url
202             String strAttributeName = elementType.getAttributeName(  );
203 
204             if ( ( strAttributeName != null ) && ( attributes != null ) )
205             {
206                 Node attributeNode = attributes.getNamedItem( strAttributeName );
207 
208                 if ( attributeNode != null )
209                 {
210                     String strSrc = attributeNode.getNodeValue(  );
211 
212                     if ( ( strSrc != null ) && strSrc.startsWith( _strBaseUrl ) )
213                     {
214                         try
215                         {
216                             URL url = new URL( strSrc );
217                             urlAttachement = new UrlAttachment( getUrlName( url ), url );
218                             listUrlAttachement.add( urlAttachement );
219                         }
220                         catch ( MalformedURLException e )
221                         {
222                             // ignored document
223                             AppLogService.info( strSrc + " not found, location ignored." );
224                         }
225                     }
226                 }
227             }
228         }
229 
230         return listUrlAttachement;
231     }
232 
233     /**
234      * Loads the url in a DataHandler
235      *
236      * @param url an absolute url
237      * @return an Object containing the DataHandler
238      */
239     protected Object getUrlContent( URL url )
240     {
241         return new DataHandler( url );
242     }
243 
244     /**
245      * Return the absolute or relative url depending on _useAbsoluteUrl
246      * @param url an absolute url
247      * @return a String representing the url
248      */
249     protected String getUrlName( URL url )
250     {
251         return _useAbsoluteUrl ? url.toExternalForm(  ) : url.getPath(  );
252     }
253 
254     /**
255      * provide a description for the HTML elements to be parsed
256      */
257     private static class ElementUrl
258     {
259         private String _strTagName;
260         private String _strAttributeName;
261         private String _strTestedAttributeName;
262         private String _strTestedAttributeValue;
263 
264         /**
265          * Instanciates an ElementUrl
266          *
267          * @param strTagName the tag name to get (example: link, script, img, ...)
268          * @param strAttributeName the attribute name to get (example: src, href, ...)
269          * @param strTestedAttributeName the attribute name to test
270          * @param strTestedAttributeValue the value of the attribute to test :
271          * if the value of the attribute strTestedAttributeName equals strTestedAttributeValue,
272          * then we get the element's url, else we do nothing.
273          */
274         public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName,
275             String strTestedAttributeValue )
276         {
277             _strTagName = strTagName;
278             _strAttributeName = strAttributeName;
279             _strTestedAttributeName = strTestedAttributeName;
280             _strTestedAttributeValue = strTestedAttributeValue;
281         }
282 
283         /**
284          * Returns the attributeName
285          * @return the attributeName
286          */
287         public String getAttributeName(  )
288         {
289             return _strAttributeName;
290         }
291 
292         /**
293          * Returns the tagName
294          * @return the tagName
295          */
296         public String getTagName(  )
297         {
298             return _strTagName;
299         }
300 
301         /**
302          * Returns the testedAttributeName
303          * @return the testedAttributeName
304          */
305         public String getTestedAttributeName(  )
306         {
307             return _strTestedAttributeName;
308         }
309 
310         /**
311          * Returns the testedAttributeValue
312          * @return the testedAttributeValue
313          */
314         public String getTestedAttributeValue(  )
315         {
316             return _strTestedAttributeValue;
317         }
318     }
319 }