PageIndexer.java
- /*
- * Copyright (c) 2002-2022, City of Paris
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice
- * and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice
- * and the following disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * 3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * License 1.0
- */
- package fr.paris.lutece.portal.service.search;
- import fr.paris.lutece.portal.business.page.Page;
- import fr.paris.lutece.portal.business.page.PageHome;
- import fr.paris.lutece.portal.service.message.SiteMessageException;
- import fr.paris.lutece.portal.service.page.IPageService;
- import fr.paris.lutece.portal.service.spring.SpringContextService;
- import fr.paris.lutece.portal.service.util.AppException;
- import fr.paris.lutece.portal.service.util.AppPropertiesService;
- import fr.paris.lutece.util.url.UrlItem;
- import org.apache.lucene.index.IndexOptions;
- import org.apache.commons.lang3.StringUtils;
- import org.apache.lucene.document.DateTools;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.FieldType;
- import org.apache.lucene.document.StoredField;
- import org.apache.lucene.document.StringField;
- import org.apache.lucene.document.TextField;
- import org.apache.tika.exception.TikaException;
- import org.apache.tika.metadata.Metadata;
- import org.apache.tika.parser.ParseContext;
- import org.apache.tika.parser.html.HtmlParser;
- import org.apache.tika.sax.BodyContentHandler;
- import org.xml.sax.ContentHandler;
- import org.xml.sax.SAXException;
- import java.io.ByteArrayInputStream;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- /**
- * Indexer service for pages
- */
- public class PageIndexer implements SearchIndexer
- {
- public static final String INDEX_TYPE_PAGE = "Page";
- public static final String INDEXER_NAME = "PageIndexer";
- protected static final String PROPERTY_PAGE_BASE_URL = "search.pageIndexer.baseUrl";
- protected static final String PROPERTY_SEARCH_PAGE_URL = "search.pageSearch.baseUrl";
- protected static final String PROPERTY_INDEXER_ENABLE = "search.pageIndexer.enable";
- protected static final String PARAMETER_PAGE_ID = "page_id";
- private static IPageService _pageService = SpringContextService.getBean( "pageService" );
- private static final String INDEXER_DESCRIPTION = "Indexer service for pages";
- private static final String INDEXER_VERSION = "1.0.0";
- /**
- * {@inheritDoc}
- */
- @Override
- public void indexDocuments( ) throws IOException, InterruptedException, SiteMessageException
- {
- String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
- List<Page> listPages = PageHome.getAllPages( );
- for ( Page page : listPages )
- {
- UrlItem url = new UrlItem( strPageBaseUrl );
- url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
- Document doc = null;
- try
- {
- doc = getDocument( page, url.getUrl( ) );
- }
- catch( Exception e )
- {
- String strMessage = "Page ID : " + page.getId( );
- IndexationService.error( this, e, strMessage );
- }
- if ( doc != null )
- {
- IndexationService.write( doc );
- }
- }
- }
- /**
- * {@inheritDoc}
- */
- @Override
- public List<Document> getDocuments( String nIdDocument ) throws IOException, InterruptedException, SiteMessageException
- {
- ArrayList<Document> listDocuments = new ArrayList<>( );
- String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
- Page page = PageHome.getPage( Integer.parseInt( nIdDocument ) );
- if ( ( page != null ) && ( page.getId( ) != 0 ) )
- {
- UrlItem url = new UrlItem( strPageBaseUrl );
- url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
- Document doc = getDocument( page, url.getUrl( ) );
- listDocuments.add( doc );
- }
- return listDocuments;
- }
- /**
- * {@inheritDoc}
- */
- @Override
- public String getName( )
- {
- return INDEXER_NAME;
- }
- /**
- * {@inheritDoc}
- */
- @Override
- public String getVersion( )
- {
- return INDEXER_VERSION;
- }
- /**
- * {@inheritDoc}
- */
- @Override
- public String getDescription( )
- {
- return INDEXER_DESCRIPTION;
- }
- /**
- * {@inheritDoc}
- */
- @Override
- public boolean isEnable( )
- {
- String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE, Boolean.TRUE.toString( ) );
- return ( strEnable.equalsIgnoreCase( Boolean.TRUE.toString( ) ) );
- }
- /**
- * Builds a document which will be used by Lucene during the indexing of the pages of the site with the following fields : summary, uid, url, contents,
- * title and description.
- *
- * @return the built Document
- * @param strUrl
- * The base URL for documents
- * @param page
- * the page to index
- * @throws IOException
- * The IO Exception
- * @throws InterruptedException
- * The InterruptedException
- * @throws SiteMessageException
- * occurs when a site message need to be displayed
- */
- protected Document getDocument( Page page, String strUrl ) throws IOException, InterruptedException, SiteMessageException
- {
- FieldType ft = new FieldType( StringField.TYPE_STORED );
- ft.setOmitNorms( false );
- FieldType ftNotStored = new FieldType( StringField.TYPE_NOT_STORED );
- ftNotStored.setOmitNorms( false );
- ftNotStored.setTokenized( false );
- FieldType ftDate = new FieldType( StringField.TYPE_STORED );
- ftDate.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
- ftDate.setStored( true );
- ftDate.setOmitNorms( false );
- FieldType ftUid = ftNotStored;
- ftUid.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
- // make a new, empty document
- Document doc = new Document( );
- // Add the url as a field named "url". Use an UnIndexed field, so
- // that the url is just stored with the document, but is not searchable.
- doc.add( new StoredField( SearchItem.FIELD_URL, strUrl ) );
- // Add the last modified date of the file a field named "modified".
- // Use a field that is indexed (i.e. searchable), but don't tokenize
- // the field into words.
- String strDate = DateTools.dateToString( page.getDateUpdate( ), DateTools.Resolution.DAY );
- doc.add( new Field( SearchItem.FIELD_DATE, strDate, ftDate ) );
- // Add the uid as a field, so that index can be incrementally maintained.
- // This field is not stored with document, it is indexed, but it is not
- // tokenized prior to indexing.
- String strIdPage = String.valueOf( page.getId( ) );
- doc.add( new Field( SearchItem.FIELD_UID, strIdPage, ftUid ) );
- String strPageContent = _pageService.getPageContent( page.getId( ), 0, null );
- ContentHandler handler = new BodyContentHandler( );
- Metadata metadata = new Metadata( );
- try
- {
- new HtmlParser( ).parse( new ByteArrayInputStream( strPageContent.getBytes( ) ), handler, metadata, new ParseContext( ) );
- }
- catch( TikaException | SAXException e )
- {
- throw new AppException( "Error during page parsing." );
- }
- // the content of the article is recovered in the parser because this one
- // had replaced the encoded caracters (as é) by the corresponding special caracter (as ?)
- StringBuilder sb = new StringBuilder( handler.toString( ) );
- // Add the tag-stripped contents as a Reader-valued Text field so it will
- // get tokenized and indexed.
- StringBuilder sbFieldContent = new StringBuilder( );
- StringBuilder sbFieldMetadata = new StringBuilder( );
- sbFieldContent.append( page.getName( ) ).append( " " ).append( sb.toString( ) );
- // Add the metadata description of the page if it exists
- if ( page.getDescription( ) != null )
- {
- sbFieldContent.append( " " ).append( page.getDescription( ) );
- }
- // Add the metadata keywords of the page if it exists
- String strMetaKeywords = page.getMetaKeywords( );
- if ( StringUtils.isNotBlank( strMetaKeywords ) )
- {
- sbFieldContent.append( " " ).append( strMetaKeywords );
- sbFieldMetadata.append( strMetaKeywords );
- }
- doc.add( new Field( SearchItem.FIELD_CONTENTS, sbFieldContent.toString( ), TextField.TYPE_NOT_STORED ) );
- if ( StringUtils.isNotBlank( page.getMetaDescription( ) ) )
- {
- if ( sbFieldMetadata.length( ) > 0 )
- {
- sbFieldMetadata.append( " " );
- }
- sbFieldMetadata.append( page.getMetaDescription( ) );
- }
- if ( sbFieldMetadata.length( ) > 0 )
- {
- doc.add( new Field( SearchItem.FIELD_METADATA, sbFieldMetadata.toString( ), TextField.TYPE_NOT_STORED ) );
- }
- // Add the title as a separate Text field, so that it can be searched
- // separately.
- doc.add( new Field( SearchItem.FIELD_TITLE, page.getName( ), ft ) );
- if ( StringUtils.isNotBlank( page.getDescription( ) ) )
- {
- // Add the summary as an UnIndexed field, so that it is stored and returned
- // with hit documents for display.
- doc.add( new StoredField( SearchItem.FIELD_SUMMARY, page.getDescription( ) ) );
- }
- doc.add( new Field( SearchItem.FIELD_TYPE, INDEX_TYPE_PAGE, ft ) );
- doc.add( new Field( SearchItem.FIELD_ROLE, page.getRole( ), ft ) );
- // return the document
- return doc;
- }
- /**
- * {@inheritDoc}
- */
- @Override
- public List<String> getListType( )
- {
- List<String> listType = new ArrayList<>( );
- listType.add( INDEX_TYPE_PAGE );
- return listType;
- }
- /**
- * {@inheritDoc}
- */
- @Override
- public String getSpecificSearchAppUrl( )
- {
- return AppPropertiesService.getProperty( PROPERTY_SEARCH_PAGE_URL );
- }
- }