// HtmlDocument.java
/*
* Copyright (c) 2002-2022, City of Paris
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright notice
* and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice
* and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* 3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* License 1.0
*/
package fr.paris.lutece.util.mail;
import fr.paris.lutece.portal.service.util.AppLogService;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.activation.DataHandler;
/**
 * This class provides an implementation to retrieve urls from specified tags on an HTML page.
 */
public class HtmlDocument
{
    // Definition of some basic html elements

    /**
     * To define a CSS, html element must have:
     * <ul>
     * <li>"link" tag name</li>
     * <li>"rel" attribute equal to "stylesheet"</li>
     * </ul>
     * The url is contained in the attribute named "href"
     */
    public static final ElementUrl ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );

    /**
     * To define a javascript, html element must have:
     * <ul>
     * <li>"script" tag name</li>
     * <li>"type" attribute equal to "text/javascript"</li>
     * </ul>
     * The url is contained in the attribute named "src"
     */
    public static final ElementUrl ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );

    /**
     * To define an image, html element must have:
     * <ul>
     * <li>"img" tag name</li>
     * </ul>
     * The url is contained in the attribute named "src"
     */
    public static final ElementUrl ELEMENT_IMG = new ElementUrl( "img", "src", null, null );

    private final Document _content;
    private final String _strBaseUrl;
    private final boolean _useAbsoluteUrl;

    /**
     * Instantiates an HtmlDocument after having built the DOM tree.
     *
     * @param strHtml
     *            The Html code to be parsed. Must not be null.
     * @param strBaseUrl
     *            The Base url used to retrieve urls. A null value is treated as an empty string, in which case every url matches.
     * @param useAbsoluteUrl
     *            Determine if we use absolute or relative url for HTML element's names
     */
    public HtmlDocument( String strHtml, String strBaseUrl, boolean useAbsoluteUrl )
    {
        // use of tidy to retrieve the DOM tree
        Tidy tidy = new Tidy( );
        tidy.setQuiet( true );
        tidy.setShowWarnings( false );
        // NOTE(review): getBytes( ) encodes with the platform default charset; confirm that it agrees with the
        // input encoding configured on Tidy, otherwise non-ASCII characters may be misread.
        _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes( ) ), null );
        _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
        _useAbsoluteUrl = useAbsoluteUrl;
    }

    /**
     * Get the urls of all html elements specified by elementType.
     * <p>
     * Elements that lack the tested attribute or the url attribute, whose url does not start with the base url, or whose url cannot be parsed are skipped.
     * </p>
     *
     * @param elementType
     *            the type of element to get
     * @return a Map containing the urls, keyed by the name returned by {@link #getUrlName(URL)}. When several elements share the same name, the last one
     *         wins.
     */
    public Map<String, URL> getAllUrls( ElementUrl elementType )
    {
        Map<String, URL> mapUrl = new HashMap<>( );

        for ( URL url : findUrls( elementType ) )
        {
            mapUrl.put( getUrlName( url ), url );
        }

        return mapUrl;
    }

    /**
     * Get the urls of all html elements specified by elementType, as attachments.
     * <p>
     * Elements that lack the tested attribute or the url attribute, whose url does not start with the base url, or whose url cannot be parsed are skipped.
     * </p>
     *
     * @param elementType
     *            the type of element to get
     * @return a List containing one UrlAttachment per matching element, in document order. Each attachment is built from the name returned by
     *         {@link #getUrlName(URL)} and the url itself.
     */
    public List<UrlAttachment> getAllUrlsAttachement( ElementUrl elementType )
    {
        List<UrlAttachment> listUrlAttachement = new ArrayList<>( );

        for ( URL url : findUrls( elementType ) )
        {
            listUrlAttachement.add( new UrlAttachment( getUrlName( url ), url ) );
        }

        return listUrlAttachement;
    }

    /**
     * Collects, in document order, the url of every element matching the given description.
     *
     * @param elementType
     *            the type of element to get
     * @return the list of urls that could be extracted, possibly empty
     */
    private List<URL> findUrls( ElementUrl elementType )
    {
        List<URL> listUrls = new ArrayList<>( );
        NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );

        for ( int i = 0; i < nodes.getLength( ); i++ )
        {
            URL url = extractUrl( nodes.item( i ).getAttributes( ), elementType );

            if ( url != null )
            {
                listUrls.add( url );
            }
        }

        return listUrls;
    }

    /**
     * Extracts the url carried by one element, if the element matches the description.
     *
     * @param attributes
     *            the attributes of the element, may be null
     * @param elementType
     *            the description of the element to match
     * @return the url, or null if the element does not match, has no url, has a url outside of the base url, or has a malformed url
     */
    private URL extractUrl( NamedNodeMap attributes, ElementUrl elementType )
    {
        if ( attributes == null )
        {
            return null;
        }

        // Test if the element matches the required attribute
        String strTestedName = elementType.getTestedAttributeName( );

        if ( strTestedName != null )
        {
            String strTestedValue = getAttributeValue( attributes, strTestedName );

            // A missing attribute cannot match: skip the element instead of failing
            if ( ( strTestedValue == null ) || !strTestedValue.equals( elementType.getTestedAttributeValue( ) ) )
            {
                return null;
            }
        }

        // Retrieve the url, then test if it matches the base url
        String strSrc = getAttributeValue( attributes, elementType.getAttributeName( ) );

        if ( ( strSrc == null ) || !strSrc.startsWith( _strBaseUrl ) )
        {
            return null;
        }

        try
        {
            return new URL( strSrc );
        }
        catch( MalformedURLException e )
        {
            // ignored document
            AppLogService.info( " {} not found, location ignored.", strSrc );

            return null;
        }
    }

    /**
     * Reads the value of a named attribute.
     *
     * @param attributes
     *            the attributes of an element
     * @param strName
     *            the name of the attribute to read, may be null
     * @return the attribute value, or null if the name is null or the attribute is absent
     */
    private static String getAttributeValue( NamedNodeMap attributes, String strName )
    {
        if ( strName == null )
        {
            return null;
        }

        Node attributeNode = attributes.getNamedItem( strName );

        return ( attributeNode == null ) ? null : attributeNode.getNodeValue( );
    }

    /**
     * Loads the url in a DataHandler
     *
     * @param url
     *            an absolute url
     * @return an Object containing the DataHandler
     */
    protected Object getUrlContent( URL url )
    {
        return new DataHandler( url );
    }

    /**
     * Return the absolute or relative url depending on _useAbsoluteUrl
     *
     * @param url
     *            an absolute url
     * @return the external form of the url if absolute urls are used, the path of the url otherwise
     */
    protected String getUrlName( URL url )
    {
        return _useAbsoluteUrl ? url.toExternalForm( ) : url.getPath( );
    }

    /**
     * provide a description for the HTML elements to be parsed
     */
    private static class ElementUrl
    {
        private final String _strTagName;
        private final String _strAttributeName;
        private final String _strTestedAttributeName;
        private final String _strTestedAttributeValue;

        /**
         * Instantiates an ElementUrl
         *
         * @param strTagName
         *            the tag name to get (example: link, script, img, ...)
         * @param strAttributeName
         *            the attribute name to get (example: src, href, ...)
         * @param strTestedAttributeName
         *            the attribute name to test, or null if no attribute has to be tested
         * @param strTestedAttributeValue
         *            the value of the attribute to test : if the value of the attribute strTestedAttributeName equals strTestedAttributeValue, then we get the
         *            element's url, else we do nothing.
         */
        public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName, String strTestedAttributeValue )
        {
            _strTagName = strTagName;
            _strAttributeName = strAttributeName;
            _strTestedAttributeName = strTestedAttributeName;
            _strTestedAttributeValue = strTestedAttributeValue;
        }

        /**
         * Returns the attributeName
         *
         * @return the attributeName
         */
        public String getAttributeName( )
        {
            return _strAttributeName;
        }

        /**
         * Returns the tagName
         *
         * @return the tagName
         */
        public String getTagName( )
        {
            return _strTagName;
        }

        /**
         * Returns the testedAttributeName
         *
         * @return the testedAttributeName
         */
        public String getTestedAttributeName( )
        {
            return _strTestedAttributeName;
        }

        /**
         * Returns the testedAttributeValue
         *
         * @return the testedAttributeValue
         */
        public String getTestedAttributeValue( )
        {
            return _strTestedAttributeValue;
        }
    }
}