SolrPageIndexer.java
/*
* Copyright (c) 2002-2021, City of Paris
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright notice
* and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice
* and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* 3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* License 1.0
*/
package fr.paris.lutece.plugins.search.solr.indexer;
import java.util.ArrayList;
import java.util.List;
import org.xml.sax.ContentHandler;
import fr.paris.lutece.plugins.search.solr.business.field.Field;
import fr.paris.lutece.plugins.search.solr.util.LuteceSolrException;
import fr.paris.lutece.plugins.search.solr.util.LuteceSolrRuntimeException;
import fr.paris.lutece.plugins.search.solr.util.SolrConstants;
import fr.paris.lutece.plugins.search.solr.util.TikaIndexerUtil;
import fr.paris.lutece.portal.business.page.Page;
import fr.paris.lutece.portal.business.page.PageHome;
import fr.paris.lutece.portal.service.message.SiteMessageException;
import fr.paris.lutece.portal.service.page.IPageService;
import fr.paris.lutece.portal.service.spring.SpringContextService;
import fr.paris.lutece.portal.service.util.AppException;
import fr.paris.lutece.portal.service.util.AppLogService;
import fr.paris.lutece.portal.service.util.AppPropertiesService;
import fr.paris.lutece.util.url.UrlItem;
/**
* The indexer service for Solr.
*
*/
public class SolrPageIndexer implements SolrIndexer
{
public static final String RESSOURCE_PAGE = "PAGE_PAGE";
public static final String NAME = "SolrPageIndexer";
private static final String PARAMETER_PAGE_ID = "page_id";
private static final String DESCRIPTION = "Solr page Indexer";
private static final String VERSION = "1.0.0";
private static final String TYPE = "PAGE";
private static final String CATEGORIE = "Html";
private static final String PROPERTY_INDEXER_ENABLE = "solr.indexer.page.enable";
private static final String BEAN_PAGE_SERVICE = "pageService";
private static final String SHORT_NAME = "page";
private static final List<String> LIST_RESSOURCES_NAME = new ArrayList<>( );
private static final String PAGE_INDEXATION_ERROR = "[SolrPageIndexer] An error occured during the indexation of the page number ";
/**
* Creates a new SolrPageIndexer
*/
public SolrPageIndexer( )
{
LIST_RESSOURCES_NAME.add( RESSOURCE_PAGE );
}
/**
* {@inheritDoc}
*/
public List<String> indexDocuments( )
{
List<Page> listPages = PageHome.getAllPages( );
List<String> lstErrors = new ArrayList<>( );
List<SolrItem> lstSolrItems = new ArrayList<>( );
for ( Page page : listPages )
{
try
{
// Generates the item to index
SolrItem item = getItem( page, SolrIndexerService.getBaseUrl( ) );
lstSolrItems.add( item );
}
catch( Exception e )
{
lstErrors.add( PAGE_INDEXATION_ERROR + page.getId( ) + " : " + SolrIndexerService.buildErrorMessage( e ) );
AppLogService.error( PAGE_INDEXATION_ERROR + page.getId( ), e );
}
}
try
{
SolrIndexerService.write( lstSolrItems );
}
catch( Exception e )
{
lstErrors.add( SolrIndexerService.buildErrorMessage( e ) );
AppLogService.error( PAGE_INDEXATION_ERROR, e );
}
return lstErrors;
}
/**
* Builds a document which will be used by Lucene during the indexing of the pages of the site with the following fields : summary, uid, url, contents,
* title and description.
*
* @return the built Document
* @param strUrl
* The base URL for documents
* @param page
* the page to index
* @throws SiteMessageException
* occurs when a site message need to be displayed
*/
private SolrItem getItem( Page page, String strUrl ) throws SiteMessageException
{
// the item
SolrItem item = new SolrItem( );
// indexing page content
IPageService pageService = SpringContextService.getBean( BEAN_PAGE_SERVICE );
String strPageContent = pageService.getPageContent( page.getId( ), 0, null );
try
{
ContentHandler handler = TikaIndexerUtil.parseHtml( strPageContent );
// the content of the article is recovered in the parser because this one
// had replaced the encoded caracters (as é) by the corresponding special caracter (as ?)
item.setContent( handler.toString( ) );
}
catch( LuteceSolrException e )
{
throw new AppException( "Error during page parsing.", e );
}
item.setTitle( page.getName( ) );
item.setRole( page.getRole( ) );
if ( ( page.getDescription( ) != null ) && ( page.getDescription( ).length( ) > 1 ) )
{
item.setSummary( page.getDescription( ) );
}
item.setType( TYPE );
item.setSite( SolrIndexerService.getWebAppName( ) );
List<String> cat = new ArrayList<>( );
cat.add( CATEGORIE );
item.setCategorie( cat );
item.setDate( page.getDateUpdate( ) );
UrlItem urlItem = new UrlItem( strUrl );
urlItem.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
item.setUrl( urlItem.getUrl( ) );
item.setUid( getResourceUid( String.valueOf( page.getId( ) ), RESSOURCE_PAGE ) );
return item;
}
/**
* Returns the name of the indexer.
*
* @return the name of the indexer
*/
@Override
public String getName( )
{
return NAME;
}
/**
* Returns the version.
*
* @return the version.
*/
@Override
public String getVersion( )
{
return VERSION;
}
/**
* {@inheritDoc}
*/
@Override
public String getDescription( )
{
return DESCRIPTION;
}
/**
* {@inheritDoc}
*/
@Override
public boolean isEnable( )
{
return "true".equalsIgnoreCase( AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE ) );
}
/**
* {@inheritDoc}
*/
@Override
public List<Field> getAdditionalFields( )
{
return null;
}
/**
* {@inheritDoc}
*/
@Override
public List<SolrItem> getDocuments( String strIdDocument )
{
List<SolrItem> lstItems = new ArrayList<>( );
try
{
int nIdDocument = Integer.parseInt( strIdDocument );
Page page = PageHome.getPage( nIdDocument );
lstItems.add( getItem( page, SolrIndexerService.getBaseUrl( ) ) );
}
catch( Exception e )
{
throw new LuteceSolrRuntimeException( e.getMessage( ), e );
}
return lstItems;
}
/**
* {@inheritDoc}
*/
@Override
public List<String> getResourcesName( )
{
return LIST_RESSOURCES_NAME;
}
/**
* {@inheritDoc}
*/
@Override
public String getResourceUid( String strResourceId, String strResourceType )
{
StringBuilder sb = new StringBuilder( strResourceId );
sb.append( SolrConstants.CONSTANT_UNDERSCORE ).append( SHORT_NAME );
return sb.toString( );
}
}