// DocSearchService.java
/*
* Copyright (c) 2002-2023, City of Paris
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright notice
* and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice
* and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* 3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* License 1.0
*/
package fr.paris.lutece.plugins.document.service.docsearch;
import fr.paris.lutece.plugins.document.business.DocumentHome;
import fr.paris.lutece.plugins.document.business.DocumentType;
import fr.paris.lutece.plugins.document.business.IndexerAction;
import fr.paris.lutece.plugins.document.business.IndexerActionFilter;
import fr.paris.lutece.plugins.document.business.IndexerActionHome;
import fr.paris.lutece.plugins.document.business.spaces.DocumentSpace;
import fr.paris.lutece.plugins.document.service.spaces.DocumentSpacesService;
import fr.paris.lutece.portal.business.user.AdminUser;
import fr.paris.lutece.portal.service.search.IndexationService;
import fr.paris.lutece.portal.service.spring.SpringContextService;
import fr.paris.lutece.portal.service.util.AppException;
import fr.paris.lutece.portal.service.util.AppLogService;
import fr.paris.lutece.portal.service.util.AppPathService;
import fr.paris.lutece.portal.service.util.AppPropertiesService;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
/**
* DocumentSearchService
*/
public class DocSearchService
{
// Constants corresponding to the variables defined in the lutece.properties file
public static final String PATH_INDEX = "document.docsearch.lucene.indexPath";
public static final String PARAM_FORCING = "forcing";
public static final String PATTERN_DATE = "dd/MM/yy";
private static final String PROPERTY_ANALYSER_CLASS_NAME = "document.docsearch.lucene.analyser.className";
private static final int MAX_RESPONSES = 1000000;
private static String _strIndex;
private static Analyzer _analyzer;
private static IndexSearcher _searcher;
private static DocSearchService _singleton;
private static IDocSearchIndexer _indexer;
/** Creates a new instance of DocumentSearchService */
private DocSearchService( )
{
// Read configuration properties
_strIndex = AppPathService.getPath( PATH_INDEX );
if ( ( _strIndex == null ) || ( _strIndex.equals( StringUtils.EMPTY ) ) )
{
throw new AppException( "Lucene index path not found in document.properties", null );
}
String strAnalyserClassName = AppPropertiesService.getProperty( PROPERTY_ANALYSER_CLASS_NAME );
if ( ( strAnalyserClassName == null ) || ( strAnalyserClassName.equals( StringUtils.EMPTY ) ) )
{
throw new AppException( "Analyser class name not found in lucene.properties", null );
}
_indexer = SpringContextService.getBean( "document.docSearchIndexer" );
try
{
_analyzer = (Analyzer) Class.forName( strAnalyserClassName ).newInstance( );
}
catch ( Exception e )
{
throw new AppException( "Failed to load Lucene Analyzer class", e );
}
}
/**
* The singleton
* @return instance of DocSearchService
*/
public static DocSearchService getInstance( )
{
if ( _singleton == null )
{
_singleton = new DocSearchService( );
}
return _singleton;
}
/**
* Indexing documents for searching
* @param bCreate tell if it's total indexing or total (total = true)
* @return indexing logs
*/
public String processIndexing( boolean bCreate )
{
StringBuilder sbLogs = new StringBuilder( );
IndexWriter writer = null;
boolean bCreateIndex = bCreate;
try
{
sbLogs.append( "\r\nIndexing all contents ...\r\n" );
Directory dir = NIOFSDirectory.open( Paths.get( _strIndex ) );
if ( !DirectoryReader.indexExists( dir ) )
{ //init index
bCreateIndex = true;
}
Date start = new Date( );
IndexWriterConfig conf = new IndexWriterConfig( _analyzer );
if ( bCreateIndex )
{
conf.setOpenMode( OpenMode.CREATE );
}
else
{
conf.setOpenMode( OpenMode.APPEND );
}
writer = new IndexWriter( dir, conf );
if ( !bCreateIndex )
{
//incremental indexing
//add all document which must be add
for ( IndexerAction action : getAllIndexerActionByTask( IndexerAction.TASK_CREATE ) )
{
ArrayList<Integer> luceneDocumentId = new ArrayList<Integer>( );
try
{
luceneDocumentId.add( action.getIdDocument( ) );
List<org.apache.lucene.document.Document> luceneDocument = _indexer.getDocuments( luceneDocumentId );
if ( ( luceneDocument != null ) && ( luceneDocument.size( ) > 0 ) )
{
Iterator<org.apache.lucene.document.Document> it = luceneDocument.iterator( );
while ( it.hasNext( ) )
{
org.apache.lucene.document.Document doc = it.next( );
writer.addDocument( doc );
sbLogs.append( "Adding " );
sbLogs.append( doc.get( DocSearchItem.FIELD_TYPE ) );
sbLogs.append( " #" );
sbLogs.append( doc.get( DocSearchItem.FIELD_UID ) );
sbLogs.append( " - " );
sbLogs.append( doc.get( DocSearchItem.FIELD_TITLE ) );
sbLogs.append( "\r\n" );
}
}
}
catch ( Exception e )
{
sbLogs.append( "Indexing DocId " + luceneDocumentId + " Error durign document indexation parsing.\r\n" );
sbLogs.append( "Caught a " );
sbLogs.append( e.getClass( ) );
sbLogs.append( "\r\n with message: " );
sbLogs.append( e.getMessage( ) );
AppLogService.error( "Indexing error : ", e );
}
removeIndexerAction( action.getIdAction( ) );
}
//Update all document which must be update
for ( IndexerAction action : getAllIndexerActionByTask( IndexerAction.TASK_MODIFY ) )
{
ArrayList<Integer> luceneDocumentId = new ArrayList<Integer>( );
try
{
luceneDocumentId.add( action.getIdDocument( ) );
List<org.apache.lucene.document.Document> luceneDocument = _indexer.getDocuments( luceneDocumentId );
if ( ( luceneDocument != null ) && ( luceneDocument.size( ) > 0 ) )
{
Iterator<org.apache.lucene.document.Document> it = luceneDocument.iterator( );
while ( it.hasNext( ) )
{
org.apache.lucene.document.Document doc = it.next( );
writer.updateDocument( new Term( DocSearchItem.FIELD_UID,
Integer.toString( action.getIdDocument( ) ) ), doc );
sbLogs.append( "Updating " );
sbLogs.append( doc.get( DocSearchItem.FIELD_TYPE ) );
sbLogs.append( " #" );
sbLogs.append( doc.get( DocSearchItem.FIELD_UID ) );
sbLogs.append( " - " );
sbLogs.append( doc.get( DocSearchItem.FIELD_TITLE ) );
sbLogs.append( "\r\n" );
}
}
}
catch ( Exception e )
{
sbLogs.append( "Indexing DocId " + luceneDocumentId + " Error durign document indexation parsing.\r\n" );
sbLogs.append( "Caught a " );
sbLogs.append( e.getClass( ) );
sbLogs.append( "\r\n with message: " );
sbLogs.append( e.getMessage( ) );
AppLogService.error( "Indexing error : ", e );
}
removeIndexerAction( action.getIdAction( ) );
}
//delete all document which must be delete
for ( IndexerAction action : getAllIndexerActionByTask( IndexerAction.TASK_DELETE ) )
{
writer.deleteDocuments( new Term( DocSearchItem.FIELD_UID,
Integer.toString( action.getIdDocument( ) ) ) );
sbLogs.append( "Deleting " );
sbLogs.append( " #" );
sbLogs.append( action.getIdDocument( ) );
sbLogs.append( "\r\n" );
removeIndexerAction( action.getIdAction( ) );
}
}
else
{
//delete all incremental action
removeAllIndexerAction( );
Collection<Integer> listIdDocuments = DocumentHome.findAllPrimaryKeys( );
for ( Integer nIdDocument : listIdDocuments )
{
ArrayList<Integer> luceneDocumentId = new ArrayList<Integer>( );
try
{
luceneDocumentId.add( nIdDocument );
List<Document> listDocuments = _indexer.getDocuments( luceneDocumentId );
for ( Document doc : listDocuments )
{
writer.addDocument( doc );
sbLogs.append( "Indexing " );
sbLogs.append( doc.get( DocSearchItem.FIELD_TYPE ) );
sbLogs.append( " #" );
sbLogs.append( doc.get( DocSearchItem.FIELD_UID ) );
sbLogs.append( " - " );
sbLogs.append( doc.get( DocSearchItem.FIELD_TITLE ) );
sbLogs.append( "\r\n" );
}
}
catch ( Exception e )
{
sbLogs.append( "Indexing DocId " + luceneDocumentId + " Error durign document indexation parsing.\r\n" );
sbLogs.append( "Caught a " );
sbLogs.append( e.getClass( ) );
sbLogs.append( "\r\n with message: " );
sbLogs.append( e.getMessage( ) );
AppLogService.error( "Indexing error : ", e );
}
}
}
Date end = new Date( );
sbLogs.append( "Duration of the treatment : " );
sbLogs.append( end.getTime( ) - start.getTime( ) );
sbLogs.append( " milliseconds\r\n" );
}
catch ( Exception e )
{
sbLogs.append( " caught a " );
sbLogs.append( e.getClass( ) );
sbLogs.append( "\n with message: " );
sbLogs.append( e.getMessage( ) );
sbLogs.append( "\r\n" );
AppLogService.error( "Indexing error : " + e.getMessage( ), e );
}
finally
{
try
{
if ( writer != null )
{
writer.close( );
}
}
catch ( IOException e )
{
AppLogService.error( e.getMessage( ), e );
}
}
return sbLogs.toString( );
}
/**
* Return search results
* @param strQuery The search query
* @param nStartIndex The start index
* @param user The user
* @return Results as a collection of SarchItem
*/
public List<DocSearchItem> getSearchResults( String strQuery, int nStartIndex, AdminUser user )
{
ArrayList<DocSearchItem> listResults = new ArrayList<DocSearchItem>( );
try( Directory directory = NIOFSDirectory.open( Paths.get( _strIndex ) ) ; IndexReader ir = DirectoryReader.open( directory ) ; )
{
_searcher = new IndexSearcher( ir );
QueryParser parser = new QueryParser( DocSearchItem.FIELD_CONTENTS,
_analyzer );
Query query = parser.parse( ( StringUtils.isNotBlank( strQuery ) ) ? strQuery : "*:*" );
List<DocumentSpace> listSpaces = DocumentSpacesService.getInstance( ).getUserAllowedSpaces( user );
Query[] filters = new Query[listSpaces.size( )];
int nIndex = 0;
for ( DocumentSpace space : listSpaces )
{
Query querySpace = new TermQuery( new Term( DocSearchItem.FIELD_SPACE, "s" + space.getId( ) ) );
filters[nIndex++] = querySpace;
}
BooleanQuery.Builder booleanQueryBuilderFilters = new BooleanQuery.Builder( );
for (Query filter: filters) {
booleanQueryBuilderFilters.add( filter , BooleanClause.Occur.SHOULD );
}
Query allFilters = booleanQueryBuilderFilters.build( );
BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder( );
booleanQueryBuilder.add( allFilters, BooleanClause.Occur.FILTER );
booleanQueryBuilder.add( query, BooleanClause.Occur.MUST );
// Get results documents
TopDocs topDocs = _searcher.search( booleanQueryBuilder.build( ) , MAX_RESPONSES );
ScoreDoc[] hits = topDocs.scoreDocs;
for ( ScoreDoc hit : hits )
{
int docId = hit.doc;
Document document = _searcher.doc( docId );
DocSearchItem si = new DocSearchItem( document );
listResults.add( si );
}
}
catch ( Exception e )
{
AppLogService.error( e.getMessage( ), e );
}
return listResults;
}
/**
* Return advanced search results
* @param strQuery The search query
* @param bTitle true for query in title
* @param bSummary true for query in summary
* @param date for filtering the result by date
* @param documentType for filtering the result by type
* @return Results as a collection of SarchItem
*/
public List<DocSearchItem> getSearchResults( String strQuery, boolean bTitle, boolean bSummary, String date,
DocumentType documentType )
{
ArrayList<DocSearchItem> listResults = new ArrayList<DocSearchItem>( );
try( Directory directory = NIOFSDirectory.open( Paths.get( _strIndex ) ) ; IndexReader ir = DirectoryReader.open( directory ) ; )
{
_searcher = new IndexSearcher( ir );
Collection<String> queries = new ArrayList<String>( );
Collection<String> fields = new ArrayList<String>( );
Collection<BooleanClause.Occur> flags = new ArrayList<BooleanClause.Occur>( );
if ( bTitle )
{
Query queryTitle = new TermQuery( new Term( DocSearchItem.FIELD_TITLE, strQuery ) );
queries.add( queryTitle.toString( ) );
fields.add( DocSearchItem.FIELD_TITLE );
flags.add( BooleanClause.Occur.SHOULD );
}
if ( bSummary )
{
Query querySummary = new TermQuery( new Term( DocSearchItem.FIELD_SUMMARY, strQuery ) );
queries.add( querySummary.toString( ) );
fields.add( DocSearchItem.FIELD_SUMMARY );
flags.add( BooleanClause.Occur.SHOULD );
}
if ( !( bTitle ) && !( bSummary ) && !( strQuery.equals( StringUtils.EMPTY ) ) )
{
Query queryContents = new TermQuery( new Term( DocSearchItem.FIELD_CONTENTS, strQuery ) );
queries.add( queryContents.toString( ) );
fields.add( DocSearchItem.FIELD_CONTENTS );
flags.add( BooleanClause.Occur.SHOULD );
}
Query queryMulti = null;
if ( strQuery.equals( StringUtils.EMPTY ) )
{
if ( documentType != null )
{
Query queryType = new TermQuery( new Term( DocSearchItem.FIELD_TYPE,
"\"" + documentType.getName( ) + "\"" ) );
queries.add( queryType.toString( ) );
fields.add( DocSearchItem.FIELD_TYPE );
flags.add( BooleanClause.Occur.SHOULD );
}
if ( ( date != null ) && ( !date.equals( StringUtils.EMPTY ) ) )
{
String formatedDate = formatDate( date );
Query queryDate = new TermQuery( new Term( DocSearchItem.FIELD_DATE, formatedDate ) );
queries.add( queryDate.toString( ) );
fields.add( DocSearchItem.FIELD_DATE );
flags.add( BooleanClause.Occur.SHOULD );
}
KeywordAnalyzer analyzer = new KeywordAnalyzer( );
queryMulti = MultiFieldQueryParser.parse(
queries.toArray( new String[queries.size( )] ), fields.toArray( new String[fields.size( )] ),
flags.toArray( new BooleanClause.Occur[flags.size( )] ), analyzer );
}
else
{
queryMulti = MultiFieldQueryParser.parse(
queries.toArray( new String[queries.size( )] ), fields.toArray( new String[fields.size( )] ),
flags.toArray( new BooleanClause.Occur[flags.size( )] ), IndexationService.getAnalyser( ) );
}
List<Query> filterList = new ArrayList<Query>( );
if ( documentType != null )
{
Query queryType = new TermQuery( new Term( DocSearchItem.FIELD_TYPE, documentType.getName( ) ) );
filterList.add( queryType );
}
if ( ( date != null ) && ( !date.equals( StringUtils.EMPTY ) ) )
{
String formatedDate = formatDate( date );
Query queryDate = new TermQuery( new Term( DocSearchItem.FIELD_DATE, formatedDate ) );
filterList.add( queryDate );
}
TopDocs topDocs = null;
if ( filterList.size( ) > 0 )
{
BooleanQuery.Builder booleanQueryBuilderFilters = new BooleanQuery.Builder( );
for (Query filter: filterList) {
booleanQueryBuilderFilters.add( filter , BooleanClause.Occur.MUST );
}
Query allFilters = booleanQueryBuilderFilters.build( );
BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder( );
booleanQueryBuilder.add( allFilters , BooleanClause.Occur.FILTER );
booleanQueryBuilder.add( queryMulti , BooleanClause.Occur.MUST );
topDocs = _searcher.search( booleanQueryBuilder.build( ), MAX_RESPONSES );
}
else
{
topDocs = _searcher.search( queryMulti, MAX_RESPONSES );
}
ScoreDoc[] hits = topDocs.scoreDocs;
for ( ScoreDoc hit : hits )
{
int docId = hit.doc;
Document document = _searcher.doc( docId );
listResults.add( new DocSearchItem( document ) );
}
}
catch ( Exception e )
{
AppLogService.error( e.getMessage( ), e );
}
return listResults;
}
/**
* return a list of IndexerAction by task key
* @param nIdTask the task key
* @return a list of IndexerAction
*/
public List<IndexerAction> getAllIndexerActionByTask( int nIdTask )
{
IndexerActionFilter filter = new IndexerActionFilter( );
filter.setIdTask( nIdTask );
return IndexerActionHome.getList( filter );
}
/**
* Remove a Indexer Action
* @param nIdAction the key of the action to remove
*/
public void removeIndexerAction( int nIdAction )
{
IndexerActionHome.remove( nIdAction );
}
/**
* Remove all Indexer Action
*
*/
public static void removeAllIndexerAction( )
{
IndexerActionHome.removeAll( );
}
/**
* Add Indexer Action to perform on a record
* @param nIdDocument the document id
* @param nIdTask the key of the action to do
*/
public void addIndexerAction( int nIdDocument, int nIdTask )
{
IndexerAction indexerAction = new IndexerAction( );
indexerAction.setIdDocument( nIdDocument );
indexerAction.setIdTask( nIdTask );
IndexerActionHome.create( indexerAction );
}
/**
* Format the date
* @param date the date
* @return formatedDate the formated date
*/
private String formatDate( String date )
{
DateFormat dateFormat = new SimpleDateFormat( PATTERN_DATE, Locale.FRENCH );
dateFormat.setLenient( false );
Date formatedDate;
try
{
formatedDate = dateFormat.parse( date.trim( ) );
}
catch ( ParseException e )
{
AppLogService.error( e );
return null;
}
return dateFormat.format( formatedDate );
}
}