View Javadoc
1   /*
2    * Copyright (c) 2002-2021, City of Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.plugins.newsletter.util;
35  
36  import fr.paris.lutece.portal.service.util.AppLogService;
37  import fr.paris.lutece.portal.service.util.AppPropertiesService;
38  
39  import java.io.ByteArrayInputStream;
40  import java.io.StringWriter;
41  import java.io.UnsupportedEncodingException;
42  
43  import javax.xml.transform.Transformer;
44  import javax.xml.transform.TransformerConfigurationException;
45  import javax.xml.transform.TransformerException;
46  import javax.xml.transform.TransformerFactory;
47  import javax.xml.transform.dom.DOMSource;
48  import javax.xml.transform.stream.StreamResult;
49  
50  import org.w3c.dom.Document;
51  import org.w3c.dom.NamedNodeMap;
52  import org.w3c.dom.Node;
53  import org.w3c.dom.NodeList;
54  import org.w3c.tidy.Tidy;
55  
56  /**
57   * This classes provides implementation to retrieve urls from specified tags on an HTML page.
58   */
59  public class HtmlDomDocNewsletter
60  {
61      public static final String CONSTANT_STATIC_URL = "https?://[^/]+/";
62      public static final String CONSTANT_PROTOCOL_DELIMITER = ":";
63  
64      // Definition of some basic html elements
65      /**
66       * To define a CSS, html element must have:
67       * <ul>
68       * <li>"link" tag name</li>
69       * <li>"rel" attribute equal to "stylesheet"</li>
70       * </ul>
71       * The url is contained in the attributed named "href"
72       */
73      public static final ElementUrl ELEMENT_CSS;
74  
75      /**
76       * To define a javascript, html element must have:
77       * <ul>
78       * <li>"script" tag name</li>
79       * <li>"type" attribute equal to "text/javascript"</li>
80       * </ul>
81       * The url is contained in the attributed named "src"
82       */
83      public static final ElementUrl ELEMENT_JAVASCRIPT;
84  
85      /**
86       * To define an image, html element must have:
87       * <ul>
88       * <li>"img" tag name</li>
89       * </ul>
90       * The url is contained in the attributed named "src"
91       */
92      public static final ElementUrl ELEMENT_IMG;
93  
94      /**
95       * To define a anchor, a element must have:
96       * <ul>
97       * <li>"a" tag name</li>
98       * </ul>
99       * The url is contained in the attributed named "href"
100      */
101     public static final ElementUrl ELEMENT_A;
102 
103     /**
104      * To define a form, form element must have:
105      * <ul>
106      * <li>"form" tag name</li>
107      * </ul>
108      * The url is contained in the attributed named "action"
109      */
110     public static final ElementUrl ELEMENT_FORM;
111 
112     private static final String PROPERTY_LUTECE_ENCODING = "lutece.encoding";
113 
114     static
115     {
116         ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );
117         ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
118         ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
119         ELEMENT_A = new ElementUrl( "a", "href", null, null );
120         ELEMENT_FORM = new ElementUrl( "form", "action", null, null );
121     }
122 
123     private Document _content;
124     private String _strBaseUrl;
125 
126     /**
127      * Instantiates an HtmlDocument after having built the DOM tree.
128      * 
129      * @param strHtml
130      *            The Html code to be parsed.
131      * @param strBaseUrl
132      *            The Base url used to retrieve urls.
133      */
134     public HtmlDomDocNewsletter( String strHtml, String strBaseUrl )
135     {
136         // use of tidy to retrieve the DOM tree
137         Tidy tidy = new Tidy( );
138         tidy.setQuiet( true );
139         tidy.setShowWarnings( false );
140 
141         String strEncoding = null;
142 
143         try
144         {
145             strEncoding = AppPropertiesService.getProperty( PROPERTY_LUTECE_ENCODING );
146             tidy.setInputEncoding( strEncoding );
147             _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes( strEncoding ) ), null );
148         }
149         catch( UnsupportedEncodingException e )
150         {
151             AppLogService.error( "Error when parsing Html document (Newsletter) : UnsupporterEncodingException (" + strEncoding + ")", e );
152         }
153 
154         _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
155     }
156 
157     /**
158      * Get the relatives urls of all html elements specified by elementType and convert its to absolutes urls
159      * 
160      * @param elementType
161      *            the type of element to get
162      */
163     public void convertAllRelativesUrls( ElementUrl elementType )
164     {
165         NodeList nodes = getDomDocument( ).getElementsByTagName( elementType.getTagName( ) );
166 
167         for ( int i = 0; i < nodes.getLength( ); i++ )
168         {
169             Node node = nodes.item( i );
170             NamedNodeMap attributes = node.getAttributes( );
171 
172             // Test if the element matches the required attribute
173             if ( elementType.getTestedAttributeName( ) != null )
174             {
175                 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
176 
177                 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
178                 {
179                     continue;
180                 }
181             }
182 
183             // Retrieve the url, then test if it matches the base url
184             Node nodeAttribute = attributes.getNamedItem( elementType.getAttributeName( ) );
185 
186             if ( nodeAttribute != null )
187             {
188                 String strSrc = nodeAttribute.getNodeValue( );
189 
190                 if ( !strSrc.matches( CONSTANT_STATIC_URL ) && !strSrc.contains( CONSTANT_PROTOCOL_DELIMITER ) )
191                 {
192                     nodeAttribute.setNodeValue( getBaseUrl( ) + strSrc );
193                 }
194             }
195         }
196     }
197 
198     /**
199      * Get the document content
200      * 
201      * @return The String content
202      */
203     public String getContent( )
204     {
205         DOMSource domSource = new DOMSource( _content );
206         StringWriter writer = new StringWriter( );
207         StreamResult result = new StreamResult( writer );
208         TransformerFactory tf = TransformerFactory.newInstance( );
209         Transformer transformer;
210 
211         try
212         {
213             transformer = tf.newTransformer( );
214             transformer.transform( domSource, result );
215         }
216         catch( TransformerConfigurationException e )
217         {
218             AppLogService.error( e.getMessage( ) );
219 
220             return null;
221         }
222         catch( TransformerException e )
223         {
224             AppLogService.error( e.getMessage( ) );
225 
226             return null;
227         }
228 
229         String stringResult = writer.toString( );
230 
231         return stringResult;
232     }
233 
234     /**
235      * Get the document used by this instance
236      * 
237      * @return The document used by this instance
238      */
239     protected org.w3c.dom.Document getDomDocument( )
240     {
241         return _content;
242     }
243 
244     /**
245      * Get the base url
246      * 
247      * @return The base url
248      */
249     protected String getBaseUrl( )
250     {
251         return _strBaseUrl;
252     }
253 
254     /**
255      * provide a description for the HTML elements to be parsed
256      */
257     protected static class ElementUrl
258     {
259         private String _strTagName;
260         private String _strAttributeName;
261         private String _strTestedAttributeName;
262         private String _strTestedAttributeValue;
263 
264         /**
265          * Instanciates an ElementUrl
266          * 
267          * @param strTagName
268          *            the tag name to get (example: link, script, img, ...)
269          * @param strAttributeName
270          *            the attribute name to get (example: src, href, ...)
271          * @param strTestedAttributeName
272          *            the attribute name to test
273          * @param strTestedAttributeValue
274          *            the value of the attribute to test : if the value of the attribute strTestedAttributeName equals strTestedAttributeValue, then we get the
275          *            element's url, else we do nothing.
276          */
277         public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName, String strTestedAttributeValue )
278         {
279             _strTagName = strTagName;
280             _strAttributeName = strAttributeName;
281             _strTestedAttributeName = strTestedAttributeName;
282             _strTestedAttributeValue = strTestedAttributeValue;
283         }
284 
285         /**
286          * Returns the attributeName
287          * 
288          * @return the attributeName
289          */
290         public String getAttributeName( )
291         {
292             return _strAttributeName;
293         }
294 
295         /**
296          * Returns the tagName
297          * 
298          * @return the tagName
299          */
300         public String getTagName( )
301         {
302             return _strTagName;
303         }
304 
305         /**
306          * Returns the testedAttributeName
307          * 
308          * @return the testedAttributeName
309          */
310         public String getTestedAttributeName( )
311         {
312             return _strTestedAttributeName;
313         }
314 
315         /**
316          * Returns the testedAttributeValue
317          * 
318          * @return the testedAttributeValue
319          */
320         public String getTestedAttributeValue( )
321         {
322             return _strTestedAttributeValue;
323         }
324     }
325 }