TikaIndexerUtil.java
/*
* Copyright (c) 2002-2021, City of Paris
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright notice
* and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice
* and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* 3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* License 1.0
*/
package fr.paris.lutece.plugins.search.solr.util;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import fr.paris.lutece.plugins.search.solr.indexer.SolrItem;
import fr.paris.lutece.portal.service.util.AppPathService;
import fr.paris.lutece.portal.service.util.AppPropertiesService;
/**
 * Utility class wrapping the Apache Tika calls used to extract text for the Solr index.
 * <p>
 * All methods are static; the class cannot be instantiated.
 */
public final class TikaIndexerUtil
{
    // Write limit handed to BodyContentHandler when "solr.document.max.size" is not configured (1048576 = 1024 * 1024)
    private static final int DEFAULT_DOCUMENT_SIZE = 1048576;
    private static final String PROPERTY_DOCUMENT_MAX_SIZE = "solr.document.max.size";
    private static final String CONFIG = AppPathService.getAbsolutePathFromRelativePath( "/WEB-INF/conf/tika.xml" );

    // Loaded on first use by getTikaConfig( ) and reused afterwards, so tika.xml is not re-read for every parsed stream.
    // Consequence: a change to tika.xml is only picked up after this class is reloaded.
    // NOTE(review): relies on a TikaConfig instance being safe to share between threads - confirm for the Tika version in use.
    private static TikaConfig _tikaConfig;

    /**
     * Private constructor: static utility class.
     */
    private TikaIndexerUtil( )
    {
    }

    /**
     * Parse HTML content with Tika's {@link HtmlParser}.
     *
     * @param strContentToIndex
     *            the HTML content to parse; must not be null
     * @param metadata
     *            the Tika metadata passed to the parser
     * @param parseContext
     *            the Tika parse context passed to the parser
     * @return the content handler containing the parsed content
     * @throws LuteceSolrException
     *             if Tika fails to read or parse the content
     */
    public static ContentHandler parseHtml( String strContentToIndex, Metadata metadata, ParseContext parseContext ) throws LuteceSolrException
    {
        try
        {
            ContentHandler handler = newBodyContentHandler( );
            // Encode explicitly: the no-arg getBytes( ) uses the platform default charset, which makes the
            // bytes handed to the parser depend on the host configuration.
            InputStream stream = new ByteArrayInputStream( strContentToIndex.getBytes( StandardCharsets.UTF_8 ) );
            new HtmlParser( ).parse( stream, handler, metadata, parseContext );
            return handler;
        }
        catch( IOException | SAXException | TikaException e )
        {
            throw new LuteceSolrException( "Error parsing content", e );
        }
    }

    /**
     * Identify the type of content and parse the stream, using the Tika configuration file
     * <code>/WEB-INF/conf/tika.xml</code>.
     *
     * @param stream
     *            the stream to parse; it is not closed by this method
     * @param metadata
     *            the Tika metadata passed to the parser
     * @param parseContext
     *            the Tika parse context passed to the parser
     * @return the content handler containing the parsed content
     * @throws LuteceSolrException
     *             if the Tika configuration cannot be loaded or if Tika fails to read or parse the stream
     */
    public static ContentHandler parse( InputStream stream, Metadata metadata, ParseContext parseContext ) throws LuteceSolrException
    {
        try
        {
            // Same configurable write limit as parseHtml; the no-arg BodyContentHandler constructor would
            // apply Tika's built-in limit instead and ignore "solr.document.max.size".
            ContentHandler handler = newBodyContentHandler( );
            AutoDetectParser parser = new AutoDetectParser( getTikaConfig( ) );
            parser.parse( stream, handler, metadata, parseContext );
            return handler;
        }
        catch( IOException | SAXException | TikaException e )
        {
            throw new LuteceSolrException( "Error parsing content", e );
        }
    }

    /**
     * Identify the type of content and parse the stream, with a fresh {@link Metadata} and {@link ParseContext}.
     *
     * @param stream
     *            the stream to parse; it is not closed by this method
     * @return the content handler containing the parsed content
     * @throws LuteceSolrException
     *             if the Tika configuration cannot be loaded or if Tika fails to read or parse the stream
     */
    public static ContentHandler parse( InputStream stream ) throws LuteceSolrException
    {
        return parse( stream, new Metadata( ), new ParseContext( ) );
    }

    /**
     * Parse HTML content, with a fresh {@link Metadata} and {@link ParseContext}.
     *
     * @param strContentToIndex
     *            the HTML content to parse; must not be null
     * @return the content handler containing the parsed content
     * @throws LuteceSolrException
     *             if Tika fails to read or parse the content
     */
    public static ContentHandler parseHtml( String strContentToIndex ) throws LuteceSolrException
    {
        return parseHtml( strContentToIndex, new Metadata( ), new ParseContext( ) );
    }

    /**
     * Parse and add the content of a file to the solr item.
     *
     * @param item
     *            the solr item receiving the extracted text
     * @param fileContent
     *            the content of the file
     * @throws LuteceSolrException
     *             if the file cannot be parsed
     */
    public static void addFileContentToSolrItem( SolrItem item, byte [ ] fileContent ) throws LuteceSolrException
    {
        addFileContentToSolrItem( item, Collections.singletonList( fileContent ) );
    }

    /**
     * Parse and add the content of multiple files to the solr item.
     * <p>
     * The extracted text of every file is concatenated, each one preceded by a single space, and the result replaces
     * the file content of the item.
     *
     * @param item
     *            the solr item receiving the extracted text
     * @param fileContentList
     *            the content of the files
     * @throws LuteceSolrException
     *             if one of the files cannot be parsed; the item is left unchanged in that case
     */
    public static void addFileContentToSolrItem( SolrItem item, List<byte [ ]> fileContentList ) throws LuteceSolrException
    {
        StringBuilder content = new StringBuilder( );
        for ( byte [ ] fileContent : fileContentList )
        {
            content.append( " " );
            try ( InputStream bais = new ByteArrayInputStream( fileContent ) )
            {
                ContentHandler handler = parse( bais );
                content.append( handler.toString( ) );
            }
            catch( IOException e )
            {
                // Only reachable from the implicit close( ) of the stream; parse( ) reports its own failures as LuteceSolrException
                throw new LuteceSolrException( "Error while parsing file for item " + item.getUid( ), e );
            }
        }
        item.setFileContent( content.toString( ) );
    }

    /**
     * Build a body content handler whose write limit is read from the "solr.document.max.size" property, falling back
     * to {@link #DEFAULT_DOCUMENT_SIZE}.
     *
     * @return a new content handler
     */
    private static ContentHandler newBodyContentHandler( )
    {
        return new BodyContentHandler( AppPropertiesService.getPropertyInt( PROPERTY_DOCUMENT_MAX_SIZE, DEFAULT_DOCUMENT_SIZE ) );
    }

    /**
     * Return the Tika configuration built from {@link #CONFIG}, loading it on the first successful call only.
     * <p>
     * When loading fails nothing is cached, so the next call tries again and the caller sees the failure each time.
     *
     * @return the shared Tika configuration
     * @throws TikaException
     *             if the configuration is invalid
     * @throws IOException
     *             if the configuration file cannot be read
     * @throws SAXException
     *             if the configuration file is not well-formed XML
     */
    private static synchronized TikaConfig getTikaConfig( ) throws TikaException, IOException, SAXException
    {
        if ( _tikaConfig == null )
        {
            _tikaConfig = new TikaConfig( CONFIG );
        }
        return _tikaConfig;
    }
}