HtmlDocument.java

  1. /*
  2.  * Copyright (c) 2002-2022, City of Paris
  3.  * All rights reserved.
  4.  *
  5.  * Redistribution and use in source and binary forms, with or without
  6.  * modification, are permitted provided that the following conditions
  7.  * are met:
  8.  *
  9.  *  1. Redistributions of source code must retain the above copyright notice
  10.  *     and the following disclaimer.
  11.  *
  12.  *  2. Redistributions in binary form must reproduce the above copyright notice
  13.  *     and the following disclaimer in the documentation and/or other materials
  14.  *     provided with the distribution.
  15.  *
  16.  *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
  17.  *     contributors may be used to endorse or promote products derived from
  18.  *     this software without specific prior written permission.
  19.  *
  20.  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  21.  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22.  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23.  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
  24.  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25.  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26.  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27.  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28.  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29.  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30.  * POSSIBILITY OF SUCH DAMAGE.
  31.  *
  32.  * License 1.0
  33.  */
  34. package fr.paris.lutece.util.mail;

  35. import fr.paris.lutece.portal.service.util.AppLogService;

  36. import org.w3c.dom.Document;
  37. import org.w3c.dom.NamedNodeMap;
  38. import org.w3c.dom.Node;
  39. import org.w3c.dom.NodeList;

  40. import org.w3c.tidy.Tidy;

  41. import java.io.ByteArrayInputStream;

  42. import java.net.MalformedURLException;
  43. import java.net.URL;

  44. import java.util.ArrayList;
  45. import java.util.HashMap;
  46. import java.util.List;
  47. import java.util.Map;

  48. import javax.activation.DataHandler;

  49. /**
  50.  * This classes provides implementation to retrieve urls from specified tags on an HTML page.
  51.  */
  52. public class HtmlDocument
  53. {
  54.     // Definition of some basic html elements
  55.     /**
  56.      * To define a CSS, html element must have:
  57.      * <ul>
  58.      * <li>"link" tag name</li>
  59.      * <li>"rel" attribute equal to "stylesheet"</li>
  60.      * </ul>
  61.      * The url is contained in the attributed named "href"
  62.      */
  63.     public static final ElementUrl ELEMENT_CSS;

  64.     /**
  65.      * To define a javascript, html element must have:
  66.      * <ul>
  67.      * <li>"script" tag name</li>
  68.      * <li>"type" attribute equal to "text/javascript"</li>
  69.      * </ul>
  70.      * The url is contained in the attributed named "src"
  71.      */
  72.     public static final ElementUrl ELEMENT_JAVASCRIPT;

  73.     /**
  74.      * To define an image, html element must have:
  75.      * <ul>
  76.      * <li>"img" tag name</li>
  77.      * </ul>
  78.      * The url is contained in the attributed named "src"
  79.      */
  80.     public static final ElementUrl ELEMENT_IMG;

  81.     static
  82.     {
  83.         ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );
  84.         ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
  85.         ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
  86.     }

  87.     private Document _content;
  88.     private String _strBaseUrl;
  89.     private boolean _useAbsoluteUrl;

  90.     /**
  91.      * Instanciates an HtmlDocument after having built the DOM tree.
  92.      *
  93.      * @param strHtml
  94.      *            The Html code to be parsed.
  95.      * @param strBaseUrl
  96.      *            The Base url used to retrieve urls.
  97.      * @param useAbsoluteUrl
  98.      *            Determine if we use absolute or relative url for HTML element's names
  99.      */
  100.     public HtmlDocument( String strHtml, String strBaseUrl, boolean useAbsoluteUrl )
  101.     {
  102.         // use of tidy to retrieve the DOM tree
  103.         Tidy tidy = new Tidy( );
  104.         tidy.setQuiet( true );
  105.         tidy.setShowWarnings( false );

  106.         _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes( ) ), null );
  107.         _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
  108.         _useAbsoluteUrl = useAbsoluteUrl;
  109.     }

  110.     /**
  111.      * Get the urls of all html elements specified by elementType
  112.      *
  113.      * @param elementType
  114.      *            the type of element to get
  115.      * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
  116.      */
  117.     public Map<String, URL> getAllUrls( ElementUrl elementType )
  118.     {
  119.         Map<String, URL> mapUrl = new HashMap<>( );

  120.         NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );

  121.         for ( int i = 0; i < nodes.getLength( ); i++ )
  122.         {
  123.             Node node = nodes.item( i );
  124.             NamedNodeMap attributes = node.getAttributes( );

  125.             // Test if the element matches the required attribute
  126.             if ( elementType.getTestedAttributeName( ) != null )
  127.             {
  128.                 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );

  129.                 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
  130.                 {
  131.                     continue;
  132.                 }
  133.             }

  134.             // Retrieve the url, then test if it matches the base url
  135.             String strSrc = attributes.getNamedItem( elementType.getAttributeName( ) ).getNodeValue( );

  136.             if ( strSrc.startsWith( _strBaseUrl ) )
  137.             {
  138.                 try
  139.                 {
  140.                     URL url = new URL( strSrc );
  141.                     mapUrl.put( getUrlName( url ), url );
  142.                 }
  143.                 catch( MalformedURLException e )
  144.                 {
  145.                     // ignored document
  146.                     AppLogService.info( " {} not found, location ignored.", strSrc );
  147.                 }
  148.             }
  149.         }

  150.         return mapUrl;
  151.     }

  152.     /**
  153.      * Get the urls of all html elements specified by elementType
  154.      *
  155.      * @param elementType
  156.      *            the type of element to get
  157.      * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
  158.      */
  159.     public List<UrlAttachment> getAllUrlsAttachement( ElementUrl elementType )
  160.     {
  161.         List<UrlAttachment> listUrlAttachement = new ArrayList<>( );
  162.         NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );

  163.         for ( int i = 0; i < nodes.getLength( ); i++ )
  164.         {
  165.             Node node = nodes.item( i );
  166.             NamedNodeMap attributes = node.getAttributes( );

  167.             // Test if the element matches the required attribute
  168.             if ( elementType.getTestedAttributeName( ) != null )
  169.             {
  170.                 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );

  171.                 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
  172.                 {
  173.                     continue;
  174.                 }
  175.             }

  176.             // Retrieve the url, then test if it matches the base url
  177.             String strAttributeName = elementType.getAttributeName( );

  178.             if ( ( strAttributeName != null ) && ( attributes != null ) )
  179.             {
  180.                 Node attributeNode = attributes.getNamedItem( strAttributeName );
  181.                 createAttributeUrl( attributeNode, listUrlAttachement );
  182.             }
  183.         }

  184.         return listUrlAttachement;
  185.     }

  186.     private void createAttributeUrl( Node attributeNode, List<UrlAttachment> listUrlAttachement )
  187.     {
  188.         if ( attributeNode != null )
  189.         {
  190.             String strSrc = attributeNode.getNodeValue( );

  191.             if ( ( strSrc != null ) && strSrc.startsWith( _strBaseUrl ) )
  192.             {
  193.                 try
  194.                 {
  195.                     URL url = new URL( strSrc );
  196.                     UrlAttachment urlAttachement = new UrlAttachment( getUrlName( url ), url );
  197.                     listUrlAttachement.add( urlAttachement );
  198.                 }
  199.                 catch( MalformedURLException e )
  200.                 {
  201.                     // ignored document
  202.                     AppLogService.info( " {} not found, location ignored.", strSrc );
  203.                 }
  204.             }
  205.         }
  206.     }

  207.     /**
  208.      * Loads the url in a DataHandler
  209.      *
  210.      * @param url
  211.      *            an absolute url
  212.      * @return an Object containing the DataHandler
  213.      */
  214.     protected Object getUrlContent( URL url )
  215.     {
  216.         return new DataHandler( url );
  217.     }

  218.     /**
  219.      * Return the absolute or relative url depending on _useAbsoluteUrl
  220.      *
  221.      * @param url
  222.      *            an absolute url
  223.      * @return a String representing the url
  224.      */
  225.     protected String getUrlName( URL url )
  226.     {
  227.         return _useAbsoluteUrl ? url.toExternalForm( ) : url.getPath( );
  228.     }

  229.     /**
  230.      * provide a description for the HTML elements to be parsed
  231.      */
  232.     private static class ElementUrl
  233.     {
  234.         private String _strTagName;
  235.         private String _strAttributeName;
  236.         private String _strTestedAttributeName;
  237.         private String _strTestedAttributeValue;

  238.         /**
  239.          * Instanciates an ElementUrl
  240.          *
  241.          * @param strTagName
  242.          *            the tag name to get (example: link, script, img, ...)
  243.          * @param strAttributeName
  244.          *            the attribute name to get (example: src, href, ...)
  245.          * @param strTestedAttributeName
  246.          *            the attribute name to test
  247.          * @param strTestedAttributeValue
  248.          *            the value of the attribute to test : if the value of the attribute strTestedAttributeName equals strTestedAttributeValue, then we get the
  249.          *            element's url, else we do nothing.
  250.          */
  251.         public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName, String strTestedAttributeValue )
  252.         {
  253.             _strTagName = strTagName;
  254.             _strAttributeName = strAttributeName;
  255.             _strTestedAttributeName = strTestedAttributeName;
  256.             _strTestedAttributeValue = strTestedAttributeValue;
  257.         }

  258.         /**
  259.          * Returns the attributeName
  260.          *
  261.          * @return the attributeName
  262.          */
  263.         public String getAttributeName( )
  264.         {
  265.             return _strAttributeName;
  266.         }

  267.         /**
  268.          * Returns the tagName
  269.          *
  270.          * @return the tagName
  271.          */
  272.         public String getTagName( )
  273.         {
  274.             return _strTagName;
  275.         }

  276.         /**
  277.          * Returns the testedAttributeName
  278.          *
  279.          * @return the testedAttributeName
  280.          */
  281.         public String getTestedAttributeName( )
  282.         {
  283.             return _strTestedAttributeName;
  284.         }

  285.         /**
  286.          * Returns the testedAttributeValue
  287.          *
  288.          * @return the testedAttributeValue
  289.          */
  290.         public String getTestedAttributeValue( )
  291.         {
  292.             return _strTestedAttributeValue;
  293.         }
  294.     }
  295. }