HtmlDocument.java
- /*
- * Copyright (c) 2002-2022, City of Paris
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice
- * and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice
- * and the following disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * 3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * License 1.0
- */
- package fr.paris.lutece.util.mail;
- import fr.paris.lutece.portal.service.util.AppLogService;
- import org.w3c.dom.Document;
- import org.w3c.dom.NamedNodeMap;
- import org.w3c.dom.Node;
- import org.w3c.dom.NodeList;
- import org.w3c.tidy.Tidy;
- import java.io.ByteArrayInputStream;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
- import javax.activation.DataHandler;
- /**
- * This classes provides implementation to retrieve urls from specified tags on an HTML page.
- */
- public class HtmlDocument
- {
- // Definition of some basic html elements
- /**
- * To define a CSS, html element must have:
- * <ul>
- * <li>"link" tag name</li>
- * <li>"rel" attribute equal to "stylesheet"</li>
- * </ul>
- * The url is contained in the attributed named "href"
- */
- public static final ElementUrl ELEMENT_CSS;
- /**
- * To define a javascript, html element must have:
- * <ul>
- * <li>"script" tag name</li>
- * <li>"type" attribute equal to "text/javascript"</li>
- * </ul>
- * The url is contained in the attributed named "src"
- */
- public static final ElementUrl ELEMENT_JAVASCRIPT;
- /**
- * To define an image, html element must have:
- * <ul>
- * <li>"img" tag name</li>
- * </ul>
- * The url is contained in the attributed named "src"
- */
- public static final ElementUrl ELEMENT_IMG;
- static
- {
- ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );
- ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
- ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
- }
- private Document _content;
- private String _strBaseUrl;
- private boolean _useAbsoluteUrl;
- /**
- * Instanciates an HtmlDocument after having built the DOM tree.
- *
- * @param strHtml
- * The Html code to be parsed.
- * @param strBaseUrl
- * The Base url used to retrieve urls.
- * @param useAbsoluteUrl
- * Determine if we use absolute or relative url for HTML element's names
- */
- public HtmlDocument( String strHtml, String strBaseUrl, boolean useAbsoluteUrl )
- {
- // use of tidy to retrieve the DOM tree
- Tidy tidy = new Tidy( );
- tidy.setQuiet( true );
- tidy.setShowWarnings( false );
- _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes( ) ), null );
- _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
- _useAbsoluteUrl = useAbsoluteUrl;
- }
- /**
- * Get the urls of all html elements specified by elementType
- *
- * @param elementType
- * the type of element to get
- * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
- */
- public Map<String, URL> getAllUrls( ElementUrl elementType )
- {
- Map<String, URL> mapUrl = new HashMap<>( );
- NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
- for ( int i = 0; i < nodes.getLength( ); i++ )
- {
- Node node = nodes.item( i );
- NamedNodeMap attributes = node.getAttributes( );
- // Test if the element matches the required attribute
- if ( elementType.getTestedAttributeName( ) != null )
- {
- String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
- if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
- {
- continue;
- }
- }
- // Retrieve the url, then test if it matches the base url
- String strSrc = attributes.getNamedItem( elementType.getAttributeName( ) ).getNodeValue( );
- if ( strSrc.startsWith( _strBaseUrl ) )
- {
- try
- {
- URL url = new URL( strSrc );
- mapUrl.put( getUrlName( url ), url );
- }
- catch( MalformedURLException e )
- {
- // ignored document
- AppLogService.info( " {} not found, location ignored.", strSrc );
- }
- }
- }
- return mapUrl;
- }
- /**
- * Get the urls of all html elements specified by elementType
- *
- * @param elementType
- * the type of element to get
- * @return a Collection containing the urls. Those urls are Objects, as defined by getUrl().
- */
- public List<UrlAttachment> getAllUrlsAttachement( ElementUrl elementType )
- {
- List<UrlAttachment> listUrlAttachement = new ArrayList<>( );
- NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
- for ( int i = 0; i < nodes.getLength( ); i++ )
- {
- Node node = nodes.item( i );
- NamedNodeMap attributes = node.getAttributes( );
- // Test if the element matches the required attribute
- if ( elementType.getTestedAttributeName( ) != null )
- {
- String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
- if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
- {
- continue;
- }
- }
- // Retrieve the url, then test if it matches the base url
- String strAttributeName = elementType.getAttributeName( );
- if ( ( strAttributeName != null ) && ( attributes != null ) )
- {
- Node attributeNode = attributes.getNamedItem( strAttributeName );
- createAttributeUrl( attributeNode, listUrlAttachement );
- }
- }
- return listUrlAttachement;
- }
- private void createAttributeUrl( Node attributeNode, List<UrlAttachment> listUrlAttachement )
- {
- if ( attributeNode != null )
- {
- String strSrc = attributeNode.getNodeValue( );
- if ( ( strSrc != null ) && strSrc.startsWith( _strBaseUrl ) )
- {
- try
- {
- URL url = new URL( strSrc );
- UrlAttachment urlAttachement = new UrlAttachment( getUrlName( url ), url );
- listUrlAttachement.add( urlAttachement );
- }
- catch( MalformedURLException e )
- {
- // ignored document
- AppLogService.info( " {} not found, location ignored.", strSrc );
- }
- }
- }
- }
- /**
- * Loads the url in a DataHandler
- *
- * @param url
- * an absolute url
- * @return an Object containing the DataHandler
- */
- protected Object getUrlContent( URL url )
- {
- return new DataHandler( url );
- }
- /**
- * Return the absolute or relative url depending on _useAbsoluteUrl
- *
- * @param url
- * an absolute url
- * @return a String representing the url
- */
- protected String getUrlName( URL url )
- {
- return _useAbsoluteUrl ? url.toExternalForm( ) : url.getPath( );
- }
- /**
- * provide a description for the HTML elements to be parsed
- */
- private static class ElementUrl
- {
- private String _strTagName;
- private String _strAttributeName;
- private String _strTestedAttributeName;
- private String _strTestedAttributeValue;
- /**
- * Instanciates an ElementUrl
- *
- * @param strTagName
- * the tag name to get (example: link, script, img, ...)
- * @param strAttributeName
- * the attribute name to get (example: src, href, ...)
- * @param strTestedAttributeName
- * the attribute name to test
- * @param strTestedAttributeValue
- * the value of the attribute to test : if the value of the attribute strTestedAttributeName equals strTestedAttributeValue, then we get the
- * element's url, else we do nothing.
- */
- public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName, String strTestedAttributeValue )
- {
- _strTagName = strTagName;
- _strAttributeName = strAttributeName;
- _strTestedAttributeName = strTestedAttributeName;
- _strTestedAttributeValue = strTestedAttributeValue;
- }
- /**
- * Returns the attributeName
- *
- * @return the attributeName
- */
- public String getAttributeName( )
- {
- return _strAttributeName;
- }
- /**
- * Returns the tagName
- *
- * @return the tagName
- */
- public String getTagName( )
- {
- return _strTagName;
- }
- /**
- * Returns the testedAttributeName
- *
- * @return the testedAttributeName
- */
- public String getTestedAttributeName( )
- {
- return _strTestedAttributeName;
- }
- /**
- * Returns the testedAttributeValue
- *
- * @return the testedAttributeValue
- */
- public String getTestedAttributeValue( )
- {
- return _strTestedAttributeValue;
- }
- }
- }