View Javadoc
1   /*
2    * Copyright (c) 2002-2017, Mairie de Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.plugins.webappcontainer.util;
35  
36  import fr.paris.lutece.plugins.webappcontainer.business.Site;
37  import fr.paris.lutece.plugins.webappcontainer.service.WebappcontainerPlugin;
38  import fr.paris.lutece.plugins.webappcontainer.web.WebappcontainerApp;
39  import fr.paris.lutece.portal.service.util.AppLogService;
40  import fr.paris.lutece.portal.service.util.AppPathService;
41  import fr.paris.lutece.portal.service.util.AppPropertiesService;
42  
43  import org.w3c.dom.Document;
44  import org.w3c.dom.Element;
45  import org.w3c.dom.NamedNodeMap;
46  import org.w3c.dom.Node;
47  import org.w3c.dom.NodeList;
48  
49  import org.w3c.tidy.Tidy;
50  
51  import java.io.ByteArrayInputStream;
52  import java.io.ByteArrayOutputStream;
53  import java.io.PrintWriter;
54  import java.io.StringWriter;
55  
56  import javax.xml.transform.OutputKeys;
57  import javax.xml.transform.Transformer;
58  import javax.xml.transform.TransformerConfigurationException;
59  import javax.xml.transform.TransformerException;
60  import javax.xml.transform.TransformerFactory;
61  import javax.xml.transform.dom.DOMSource;
62  import javax.xml.transform.stream.StreamResult;
63  
64  
65  /**
66   * This classes provides implementation to retrieve urls from specified tags
67   * on an HTML page.
68   */
69  public class HtmlDocumentWebappcontainer
70  {
71      public static final String CONSTANT_STATIC_URL = "https?://[^/]+/";
72      public static final String CONSTANT_PROTOCOL_DELIMITER = ":";
73      private static final String PROPERTY_PARSING_STOP_WHEN_ERROR = "webappcontainer.jtidy.parsing.stopWhenError";
74      private static final String TAG_INPUT = "input";
75      private static final String TAG_INPUT_ATTRIBUTE_TYPE = "type";
76      private static final String TAG_INPUT_ATTRIBUTE_NAME = "name";
77      private static final String TAG_INPUT_ATTRIBUTE_VALUE = "value";
78      private static final String TAG_INPUT_ATTRIBUTE_TYPE_VALUE_HIDDEN = "hidden";
79      private static final String EMPTY_STRING = "";
80      private static final String OMIT_XML_DECLARATION_TRUE = "yes";
81      private static final String METHOD_HTML = "html";
82  
83      // Definition of some basic html elements
84      /**
85       *  To define a CSS link, html element must have:
86       *  <ul>
87       *  <li>"link" tag name</li>
88       *  <li>"rel" attribute equal to "stylesheet"</li>
89       *  </ul>
90       *  The url is contained in the attributed named "href"
91       */
92      public static final ElementUrl ELEMENT_CSS_LINK;
93  
94      /**
95       *  To define a CSS style, html element must have:
96       *  <ul>
97       *  <li>"style" tag name</li>
98       *  <li>"type" attribute equal to "text/css"</li>
99       *  </ul>
100      */
101     public static final ElementUrl ELEMENT_CSS_STYLE;
102 
103     /**
104      *  To define a RSS/XML, link element must have:
105      *  <ul>
106      *  <li>"link" tag name</li>
107      *  <li>"rel" attribute equal to "alternate"</li>
108      *  </ul>
109      *  The url is contained in the attributed named "href"
110      */
111     public static final ElementUrl ELEMENT_ALTERNATE;
112 
113     /**
114      *  To define a javascript, html element must have:
115      *  <ul>
116      *  <li>"script" tag name</li>
117      *  <li>"type" attribute equal to "text/javascript"</li>
118      *  </ul>
119      *  The url is contained in the attributed named "src"
120      */
121     public static final ElementUrl ELEMENT_JAVASCRIPT;
122 
123     /**
124          *  To define an image, html element must have:
125          *  <ul>
126          *  <li>"img" tag name</li>
127          *  </ul>
128          *  The url is contained in the attributed named "src"
129          */
130     public static final ElementUrl ELEMENT_IMG;
131 
132     /**
133      *  To define a anchor, a element must have:
134      *  <ul>
135      *  <li>"a" tag name</li>
136      *  </ul>
137      *  The url is contained in the attributed named "href"
138      */
139     public static final ElementUrl ELEMENT_A;
140 
141     /**
142      *  To define a form, form element must have:
143      *  <ul>
144      *  <li>"form" tag name</li>
145      *  </ul>
146      *  The url is contained in the attributed named "action"
147      */
148     public static final ElementUrl ELEMENT_FORM;
149 
150     /**
151      *  To define a head, head element must have:
152      *  <ul>
153      *  <li>"head" tag name</li>
154      *  </ul>
155      */
156     public static final ElementUrl ELEMENT_HEAD;
157 
158     /**
159      *  To define a body, body element must have:
160      *  <ul>
161      *  <li>"body" tag name</li>
162      *  </ul>
163      */
164     public static final ElementUrl ELEMENT_BODY;
165 
166     /**
167      *  To define a base, base element must have:
168      *  <ul>
169      *  <li>"base" tag name</li>
170      *  </ul>
171      */
172     public static final ElementUrl ELEMENT_BASE;
173 
174     /**
175      *  To define an input. input element must have:
176      *  <ul>
177      *  <li>"type" = "hidden" tag</li>
178      *  </ul>
179      */
180     public static final ElementUrl ELEMENT_INPUT;
181 
182     static
183     {
184         ELEMENT_CSS_LINK = new ElementUrl( "link", "href", "rel", "stylesheet" );
185         ELEMENT_CSS_STYLE = new ElementUrl( "style", null, "type", "text/css" );
186         ELEMENT_ALTERNATE = new ElementUrl( "link", "href", "rel", "alternate" );
187         ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
188         ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
189         ELEMENT_A = new ElementUrl( "a", "href", null, null );
190         ELEMENT_FORM = new ElementUrl( "form", "action", null, null );
191         ELEMENT_HEAD = new ElementUrl( "head", null, null, null );
192         ELEMENT_BODY = new ElementUrl( "body", null, null, null );
193         ELEMENT_BASE = new ElementUrl( "base", "href", null, null );
194         ELEMENT_INPUT = new ElementUrl( "input", null, null, null );
195     }
196 
197     private Document _content;
198 
199     /**
200      * Instantiates an HtmlDocumentWebappcontainer after having built the DOM tree.
201      *
202      * @param byteHtml The Html code to be parsed.
203      * @param strEncoding The encoding used to retrieve urls.
204      * @throws HtmlDocumentWebappcontainerException if errors when parsing HTML
205      */
206     public HtmlDocumentWebappcontainer( byte[] byteHtml, String strEncoding )
207         throws HtmlDocumentWebappcontainerException
208     {
209         // use of tidy to retrieve the DOM tree
210         Tidy tidy = new Tidy(  );
211         tidy.setQuiet( true );
212         tidy.setShowWarnings( false );
213 
214         ByteArrayOutputStream baErrors = new ByteArrayOutputStream(  );
215         PrintWriter pw = new PrintWriter( baErrors, true );
216         tidy.setErrout( pw );
217         tidy.setTidyMark( false );
218         tidy.setInputEncoding( strEncoding );
219         _content = tidy.parseDOM( new ByteArrayInputStream( byteHtml ), null );
220 
221         boolean bStopWhenError = Boolean.parseBoolean( AppPropertiesService.getProperty( 
222                     PROPERTY_PARSING_STOP_WHEN_ERROR, Boolean.toString( false ) ) );
223 
224         if ( bStopWhenError && !baErrors.toString(  ).equals( EMPTY_STRING ) )
225         {
226             throw new HtmlDocumentWebappcontainerException( baErrors.toString(  ), null );
227         }
228     }
229 
230     /**
231      * Get the urls of all html elements specified by elementType and convert its to absolutes urls
232      *
233      * @param elementType the type of element to get
234      * @param strBaseUrlSite The base url of the external site
235      * @param site The external site concerned by url conversion
236      * @param strReplaceUrl The prefix of the new url (ie : link to webappcontainer servlet or webappcontainer Xpage)
237      * @param bEncodeUrl true = encode (Base64) the url
238      *
239      */
240     public void convertUrls( ElementUrl elementType, String strBaseUrlSite, Site site, String strReplaceUrl,
241         boolean bEncodeUrl )
242     {
243         if ( elementType == ELEMENT_FORM )
244         {
245             changeInputs(  );
246         }
247 
248         NodeList nodes = _content.getElementsByTagName( elementType.getTagName(  ) );
249 
250         for ( int i = 0; i < nodes.getLength(  ); i++ )
251         {
252             Node node = nodes.item( i );
253             NamedNodeMap attributes = node.getAttributes(  );
254 
255             // Test if the element matches the required attribute
256             if ( ( elementType.getTestedAttributeName(  ) != null ) &&
257                     ( attributes.getNamedItem( elementType.getTestedAttributeName(  ) ) != null ) )
258             {
259                 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName(  ) ).getNodeValue(  );
260 
261                 if ( !elementType.getTestedAttributeValue(  ).equalsIgnoreCase( strRel ) )
262                 {
263                     continue;
264                 }
265             }
266 
267             // Retrieve the url, then test if it matches the base url
268             Node nodeAttribute = attributes.getNamedItem( elementType.getAttributeName(  ) );
269 
270             if ( nodeAttribute == null )
271             {
272                 continue;
273             }
274 
275             String strSrc = nodeAttribute.getNodeValue(  ).trim(  );
276             String strAbsoluteUrl = UrlUtils.convertRelativeToAbsoluteUrl( strSrc, strBaseUrlSite );
277             boolean isExternalUrl = !UrlUtils.hostsEquals( strAbsoluteUrl, site.getUrl(  ) );
278 
279             if ( bEncodeUrl && !isExternalUrl )
280             {
281                 strAbsoluteUrl = UrlUtils.encodeUrl( strAbsoluteUrl );
282             }
283 
284             if ( elementType == ELEMENT_FORM )
285             {
286                 Element elementHiddenXPageName = _content.createElement( TAG_INPUT );
287                 elementHiddenXPageName.setAttribute( TAG_INPUT_ATTRIBUTE_TYPE, TAG_INPUT_ATTRIBUTE_TYPE_VALUE_HIDDEN );
288                 elementHiddenXPageName.setAttribute( TAG_INPUT_ATTRIBUTE_NAME, WebappcontainerApp.PARAMETER_PAGE );
289                 elementHiddenXPageName.setAttribute( TAG_INPUT_ATTRIBUTE_VALUE, WebappcontainerPlugin.PLUGIN_NAME );
290 
291                 node.appendChild( elementHiddenXPageName );
292 
293                 Element elementHiddenSiteCode = _content.createElement( TAG_INPUT );
294                 elementHiddenSiteCode.setAttribute( TAG_INPUT_ATTRIBUTE_TYPE, TAG_INPUT_ATTRIBUTE_TYPE_VALUE_HIDDEN );
295                 elementHiddenSiteCode.setAttribute( TAG_INPUT_ATTRIBUTE_NAME, WebappcontainerApp.PARAMETER_CODE );
296                 elementHiddenSiteCode.setAttribute( TAG_INPUT_ATTRIBUTE_VALUE, site.getCode(  ) );
297 
298                 node.appendChild( elementHiddenSiteCode );
299 
300                 Element elementHiddenWebappUrl = _content.createElement( TAG_INPUT );
301                 elementHiddenWebappUrl.setAttribute( TAG_INPUT_ATTRIBUTE_TYPE, TAG_INPUT_ATTRIBUTE_TYPE_VALUE_HIDDEN );
302                 elementHiddenWebappUrl.setAttribute( TAG_INPUT_ATTRIBUTE_NAME, WebappcontainerApp.PARAMETER_WEBAPP_URL );
303                 elementHiddenWebappUrl.setAttribute( TAG_INPUT_ATTRIBUTE_VALUE, strAbsoluteUrl );
304 
305                 node.appendChild( elementHiddenWebappUrl );
306 
307                 nodeAttribute.setNodeValue( AppPathService.getPortalUrl(  ) );
308             }
309             else
310             {
311                 // Concat the "webappcontainer" part of the url with the external site url (encoded)
312                 nodeAttribute.setNodeValue( ( !isExternalUrl ) ? ( strReplaceUrl + strAbsoluteUrl ) : strAbsoluteUrl );
313             }
314         }
315     }
316 
317     /**
318      * Get first element of the specified elementType
319      *
320      * @param elementType the type of element to get
321      * @return the content of the first element specified by the element type
322      */
323     public StringBuffer getFirstElement( ElementUrl elementType )
324     {
325         NodeList nodes = _content.getElementsByTagName( elementType.getTagName(  ) );
326 
327         if ( nodes.getLength(  ) == 0 )
328         {
329             return null;
330         }
331 
332         Node node = nodes.item( 0 );
333         NodeList childNodeList = node.getChildNodes(  );
334         StringBuffer stringBuffer = new StringBuffer(  );
335 
336         for ( int i = 0; i < childNodeList.getLength(  ); i++ )
337         {
338             stringBuffer.append( getNodeContent( childNodeList.item( i ) ) );
339         }
340 
341         return stringBuffer;
342     }
343 
344     /**
345      * Get all elements of the specified elementType
346      *
347      * @param elementType the type of element to get
348      * @return the content of all elements specified by the element type
349      */
350     public StringBuffer getElements( ElementUrl elementType )
351     {
352         NodeList nodes = _content.getElementsByTagName( elementType.getTagName(  ) );
353 
354         if ( nodes.getLength(  ) == 0 )
355         {
356             return null;
357         }
358 
359         StringBuffer stringBuffer = new StringBuffer(  );
360 
361         for ( int i = 0; i < nodes.getLength(  ); i++ )
362         {
363             stringBuffer.append( getNodeContent( nodes.item( i ) ) );
364         }
365 
366         return stringBuffer;
367     }
368 
369     /**
370      * Remove the first element of the specified elementType
371      *
372      * @param elementType the type of element to remove
373      */
374     public void removeFirstElement( ElementUrl elementType )
375     {
376         NodeList nodes = _content.getElementsByTagName( elementType.getTagName(  ) );
377 
378         if ( nodes.getLength(  ) != 0 )
379         {
380             Node node = nodes.item( 0 );
381             Node parentNode = node.getParentNode(  );
382             parentNode.removeChild( node );
383         }
384     }
385 
386     /**
387      * Get first element attribute of the specified elementType
388      *
389      * @param elementType the type of element to get
390      * @return the content of the first element attribute specified by the element type
391      */
392     public String getFirstElementAttribute( ElementUrl elementType )
393     {
394         NodeList nodes = _content.getElementsByTagName( elementType.getTagName(  ) );
395 
396         if ( nodes.getLength(  ) == 0 )
397         {
398             return null;
399         }
400 
401         Node node = nodes.item( 0 );
402         NamedNodeMap attributes = node.getAttributes(  );
403 
404         // Test if the element matches the required attribute
405         if ( elementType.getTestedAttributeName(  ) != null )
406         {
407             String strRel = attributes.getNamedItem( elementType.getTestedAttributeName(  ) ).getNodeValue(  );
408 
409             if ( !elementType.getTestedAttributeValue(  ).equals( strRel ) )
410             {
411                 return null;
412             }
413         }
414 
415         // Retrieve the url, then test if it matches the base url
416         Node nodeAttribute = attributes.getNamedItem( elementType.getAttributeName(  ) );
417 
418         if ( nodeAttribute == null )
419         {
420             return null;
421         }
422 
423         return nodeAttribute.getNodeValue(  );
424     }
425 
426     /**
427      * Get the document content
428      * @return The StringBuffer content
429      */
430     public StringBuffer getContent(  )
431     {
432         DOMSource domSource = new DOMSource( _content );
433         StringWriter writer = new StringWriter(  );
434         StreamResult result = new StreamResult( writer );
435         TransformerFactory tf = TransformerFactory.newInstance(  );
436         Transformer transformer;
437 
438         try
439         {
440             transformer = tf.newTransformer(  );
441             transformer.setOutputProperty( OutputKeys.OMIT_XML_DECLARATION, OMIT_XML_DECLARATION_TRUE );
442             transformer.setOutputProperty( OutputKeys.METHOD, METHOD_HTML );
443             transformer.transform( domSource, result );
444         }
445         catch ( TransformerConfigurationException e )
446         {
447             AppLogService.error( e.getMessage(  ) );
448 
449             return null;
450         }
451         catch ( TransformerException e )
452         {
453             AppLogService.error( e.getMessage(  ) );
454 
455             return null;
456         }
457 
458         return writer.getBuffer(  );
459     }
460 
461     /**
462      * Tranform the tags <input name="xxx" ... /> to <input name="page_external_site_xxx" ... />
463      */
464     private void changeInputs(  )
465     {
466         NodeList nodes2 = _content.getElementsByTagName( ELEMENT_INPUT.getTagName(  ) );
467 
468         for ( int j = 0; j < nodes2.getLength(  ); j++ )
469         {
470             Element node2 = (Element) nodes2.item( j );
471 
472             node2.setAttribute( TAG_INPUT_ATTRIBUTE_NAME,
473                 WebappcontainerApp.PARAMETER_PAGE_HACK + node2.getAttribute( TAG_INPUT_ATTRIBUTE_NAME ) );
474         }
475     }
476 
477     /**
478      * Transform the node content to XML
479      *
480      * @param node The node
481      * @return The node content in XML
482      */
483     private String getNodeContent( Node node )
484     {
485         DOMSource domSource = new DOMSource( node );
486         StringWriter writer = new StringWriter(  );
487         StreamResult result = new StreamResult( writer );
488         TransformerFactory tf = TransformerFactory.newInstance(  );
489         Transformer transformer;
490 
491         try
492         {
493             transformer = tf.newTransformer(  );
494             transformer.setOutputProperty( OutputKeys.OMIT_XML_DECLARATION, OMIT_XML_DECLARATION_TRUE );
495             transformer.setOutputProperty( OutputKeys.METHOD, METHOD_HTML );
496             transformer.setOutputProperty( OutputKeys.MEDIA_TYPE, "text/html" );
497             transformer.transform( domSource, result );
498         }
499         catch ( TransformerConfigurationException e )
500         {
501             AppLogService.error( e.getMessage(  ) );
502 
503             return null;
504         }
505         catch ( TransformerException e )
506         {
507             AppLogService.error( e.getMessage(  ) );
508 
509             return null;
510         }
511 
512         return writer.toString(  );
513     }
514 
515     /**
516      * provide a description for the HTML elements to be parsed
517      */
518     private static class ElementUrl
519     {
520         private String _strTagName;
521         private String _strAttributeName;
522         private String _strTestedAttributeName;
523         private String _strTestedAttributeValue;
524 
525         /**
526          * Instanciates an ElementUrl
527          *
528          * @param strTagName the tag name to get (example: link, script, img, ...)
529          * @param strAttributeName the attribute name to get (example: src, href, ...)
530          * @param strTestedAttributeName the attribute name to test
531          * @param strTestedAttributeValue the value of the attribute to test :
532          * if the value of the attribute strTestedAttributeName equals strTestedAttributeValue,
533          * then we get the element's url, else we do nothing.
534          */
535         public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName,
536             String strTestedAttributeValue )
537         {
538             _strTagName = strTagName;
539             _strAttributeName = strAttributeName;
540             _strTestedAttributeName = strTestedAttributeName;
541             _strTestedAttributeValue = strTestedAttributeValue;
542         }
543 
544         /**
545          * Returns the attributeName
546          * @return the attributeName
547          */
548         public String getAttributeName(  )
549         {
550             return _strAttributeName;
551         }
552 
553         /**
554          * Returns the tagName
555          * @return the tagName
556          */
557         public String getTagName(  )
558         {
559             return _strTagName;
560         }
561 
562         /**
563          * Returns the testedAttributeName
564          * @return the testedAttributeName
565          */
566         public String getTestedAttributeName(  )
567         {
568             return _strTestedAttributeName;
569         }
570 
571         /**
572          * Returns the testedAttributeValue
573          * @return the testedAttributeValue
574          */
575         public String getTestedAttributeValue(  )
576         {
577             return _strTestedAttributeValue;
578         }
579     }
580 }