PageIndexer.java

  1. /*
  2.  * Copyright (c) 2002-2022, City of Paris
  3.  * All rights reserved.
  4.  *
  5.  * Redistribution and use in source and binary forms, with or without
  6.  * modification, are permitted provided that the following conditions
  7.  * are met:
  8.  *
  9.  *  1. Redistributions of source code must retain the above copyright notice
  10.  *     and the following disclaimer.
  11.  *
  12.  *  2. Redistributions in binary form must reproduce the above copyright notice
  13.  *     and the following disclaimer in the documentation and/or other materials
  14.  *     provided with the distribution.
  15.  *
  16.  *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
  17.  *     contributors may be used to endorse or promote products derived from
  18.  *     this software without specific prior written permission.
  19.  *
  20.  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  21.  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22.  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23.  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
  24.  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25.  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26.  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27.  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28.  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29.  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30.  * POSSIBILITY OF SUCH DAMAGE.
  31.  *
  32.  * License 1.0
  33.  */
  34. package fr.paris.lutece.portal.service.search;

  35. import fr.paris.lutece.portal.business.page.Page;
  36. import fr.paris.lutece.portal.business.page.PageHome;
  37. import fr.paris.lutece.portal.service.message.SiteMessageException;
  38. import fr.paris.lutece.portal.service.page.IPageService;
  39. import fr.paris.lutece.portal.service.spring.SpringContextService;
  40. import fr.paris.lutece.portal.service.util.AppException;
  41. import fr.paris.lutece.portal.service.util.AppPropertiesService;
  42. import fr.paris.lutece.util.url.UrlItem;
  43. import org.apache.lucene.index.IndexOptions;

  44. import org.apache.commons.lang3.StringUtils;

  45. import org.apache.lucene.document.DateTools;
  46. import org.apache.lucene.document.Document;
  47. import org.apache.lucene.document.Field;
  48. import org.apache.lucene.document.FieldType;
  49. import org.apache.lucene.document.StoredField;
  50. import org.apache.lucene.document.StringField;
  51. import org.apache.lucene.document.TextField;

  52. import org.apache.tika.exception.TikaException;
  53. import org.apache.tika.metadata.Metadata;
  54. import org.apache.tika.parser.ParseContext;
  55. import org.apache.tika.parser.html.HtmlParser;
  56. import org.apache.tika.sax.BodyContentHandler;

  57. import org.xml.sax.ContentHandler;
  58. import org.xml.sax.SAXException;

  59. import java.io.ByteArrayInputStream;
  60. import java.io.IOException;

  61. import java.util.ArrayList;
  62. import java.util.List;

  63. /**
  64.  * Indexer service for pages
  65.  */
  66. public class PageIndexer implements SearchIndexer
  67. {
  68.     public static final String INDEX_TYPE_PAGE = "Page";
  69.     public static final String INDEXER_NAME = "PageIndexer";
  70.     protected static final String PROPERTY_PAGE_BASE_URL = "search.pageIndexer.baseUrl";
  71.     protected static final String PROPERTY_SEARCH_PAGE_URL = "search.pageSearch.baseUrl";
  72.     protected static final String PROPERTY_INDEXER_ENABLE = "search.pageIndexer.enable";
  73.     protected static final String PARAMETER_PAGE_ID = "page_id";
  74.     private static IPageService _pageService = SpringContextService.getBean( "pageService" );
  75.     private static final String INDEXER_DESCRIPTION = "Indexer service for pages";
  76.     private static final String INDEXER_VERSION = "1.0.0";

  77.     /**
  78.      * {@inheritDoc}
  79.      */
  80.     @Override
  81.     public void indexDocuments( ) throws IOException, InterruptedException, SiteMessageException
  82.     {
  83.         String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
  84.         List<Page> listPages = PageHome.getAllPages( );

  85.         for ( Page page : listPages )
  86.         {
  87.             UrlItem url = new UrlItem( strPageBaseUrl );
  88.             url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );

  89.             Document doc = null;

  90.             try
  91.             {
  92.                 doc = getDocument( page, url.getUrl( ) );
  93.             }
  94.             catch( Exception e )
  95.             {
  96.                 String strMessage = "Page ID : " + page.getId( );
  97.                 IndexationService.error( this, e, strMessage );
  98.             }

  99.             if ( doc != null )
  100.             {
  101.                 IndexationService.write( doc );
  102.             }
  103.         }
  104.     }

  105.     /**
  106.      * {@inheritDoc}
  107.      */
  108.     @Override
  109.     public List<Document> getDocuments( String nIdDocument ) throws IOException, InterruptedException, SiteMessageException
  110.     {
  111.         ArrayList<Document> listDocuments = new ArrayList<>( );
  112.         String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );

  113.         Page page = PageHome.getPage( Integer.parseInt( nIdDocument ) );

  114.         if ( ( page != null ) && ( page.getId( ) != 0 ) )
  115.         {
  116.             UrlItem url = new UrlItem( strPageBaseUrl );
  117.             url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );

  118.             Document doc = getDocument( page, url.getUrl( ) );
  119.             listDocuments.add( doc );
  120.         }

  121.         return listDocuments;
  122.     }

  123.     /**
  124.      * {@inheritDoc}
  125.      */
  126.     @Override
  127.     public String getName( )
  128.     {
  129.         return INDEXER_NAME;
  130.     }

  131.     /**
  132.      * {@inheritDoc}
  133.      */
  134.     @Override
  135.     public String getVersion( )
  136.     {
  137.         return INDEXER_VERSION;
  138.     }

  139.     /**
  140.      * {@inheritDoc}
  141.      */
  142.     @Override
  143.     public String getDescription( )
  144.     {
  145.         return INDEXER_DESCRIPTION;
  146.     }

  147.     /**
  148.      * {@inheritDoc}
  149.      */
  150.     @Override
  151.     public boolean isEnable( )
  152.     {
  153.         String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE, Boolean.TRUE.toString( ) );

  154.         return ( strEnable.equalsIgnoreCase( Boolean.TRUE.toString( ) ) );
  155.     }

  156.     /**
  157.      * Builds a document which will be used by Lucene during the indexing of the pages of the site with the following fields : summary, uid, url, contents,
  158.      * title and description.
  159.      *
  160.      * @return the built Document
  161.      * @param strUrl
  162.      *            The base URL for documents
  163.      * @param page
  164.      *            the page to index
  165.      * @throws IOException
  166.      *             The IO Exception
  167.      * @throws InterruptedException
  168.      *             The InterruptedException
  169.      * @throws SiteMessageException
  170.      *             occurs when a site message need to be displayed
  171.      */
  172.     protected Document getDocument( Page page, String strUrl ) throws IOException, InterruptedException, SiteMessageException
  173.     {
  174.         FieldType ft = new FieldType( StringField.TYPE_STORED );
  175.         ft.setOmitNorms( false );

  176.         FieldType ftNotStored = new FieldType( StringField.TYPE_NOT_STORED );
  177.         ftNotStored.setOmitNorms( false );
  178.         ftNotStored.setTokenized( false );

  179.         FieldType ftDate = new FieldType( StringField.TYPE_STORED );
  180.         ftDate.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
  181.         ftDate.setStored( true );
  182.         ftDate.setOmitNorms( false );

  183.         FieldType ftUid = ftNotStored;
  184.         ftUid.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
  185.         // make a new, empty document
  186.         Document doc = new Document( );

  187.         // Add the url as a field named "url". Use an UnIndexed field, so
  188.         // that the url is just stored with the document, but is not searchable.
  189.         doc.add( new StoredField( SearchItem.FIELD_URL, strUrl ) );

  190.         // Add the last modified date of the file a field named "modified".
  191.         // Use a field that is indexed (i.e. searchable), but don't tokenize
  192.         // the field into words.
  193.         String strDate = DateTools.dateToString( page.getDateUpdate( ), DateTools.Resolution.DAY );
  194.         doc.add( new Field( SearchItem.FIELD_DATE, strDate, ftDate ) );

  195.         // Add the uid as a field, so that index can be incrementally maintained.
  196.         // This field is not stored with document, it is indexed, but it is not
  197.         // tokenized prior to indexing.
  198.         String strIdPage = String.valueOf( page.getId( ) );
  199.         doc.add( new Field( SearchItem.FIELD_UID, strIdPage, ftUid ) );

  200.         String strPageContent = _pageService.getPageContent( page.getId( ), 0, null );
  201.         ContentHandler handler = new BodyContentHandler( );
  202.         Metadata metadata = new Metadata( );

  203.         try
  204.         {
  205.             new HtmlParser( ).parse( new ByteArrayInputStream( strPageContent.getBytes( ) ), handler, metadata, new ParseContext( ) );
  206.         }
  207.         catch( TikaException | SAXException e )
  208.         {
  209.             throw new AppException( "Error during page parsing." );
  210.         }

  211.         // the content of the article is recovered in the parser because this one
  212.         // had replaced the encoded caracters (as &eacute;) by the corresponding special caracter (as ?)
  213.         StringBuilder sb = new StringBuilder( handler.toString( ) );

  214.         // Add the tag-stripped contents as a Reader-valued Text field so it will
  215.         // get tokenized and indexed.
  216.         StringBuilder sbFieldContent = new StringBuilder( );
  217.         StringBuilder sbFieldMetadata = new StringBuilder( );
  218.         sbFieldContent.append( page.getName( ) ).append( " " ).append( sb.toString( ) );

  219.         // Add the metadata description of the page if it exists
  220.         if ( page.getDescription( ) != null )
  221.         {
  222.             sbFieldContent.append( " " ).append( page.getDescription( ) );
  223.         }

  224.         // Add the metadata keywords of the page if it exists
  225.         String strMetaKeywords = page.getMetaKeywords( );

  226.         if ( StringUtils.isNotBlank( strMetaKeywords ) )
  227.         {
  228.             sbFieldContent.append( " " ).append( strMetaKeywords );
  229.             sbFieldMetadata.append( strMetaKeywords );
  230.         }

  231.         doc.add( new Field( SearchItem.FIELD_CONTENTS, sbFieldContent.toString( ), TextField.TYPE_NOT_STORED ) );

  232.         if ( StringUtils.isNotBlank( page.getMetaDescription( ) ) )
  233.         {
  234.             if ( sbFieldMetadata.length( ) > 0 )
  235.             {
  236.                 sbFieldMetadata.append( " " );
  237.             }

  238.             sbFieldMetadata.append( page.getMetaDescription( ) );
  239.         }

  240.         if ( sbFieldMetadata.length( ) > 0 )
  241.         {
  242.             doc.add( new Field( SearchItem.FIELD_METADATA, sbFieldMetadata.toString( ), TextField.TYPE_NOT_STORED ) );
  243.         }

  244.         // Add the title as a separate Text field, so that it can be searched
  245.         // separately.
  246.         doc.add( new Field( SearchItem.FIELD_TITLE, page.getName( ), ft ) );

  247.         if ( StringUtils.isNotBlank( page.getDescription( ) ) )
  248.         {
  249.             // Add the summary as an UnIndexed field, so that it is stored and returned
  250.             // with hit documents for display.
  251.             doc.add( new StoredField( SearchItem.FIELD_SUMMARY, page.getDescription( ) ) );
  252.         }

  253.         doc.add( new Field( SearchItem.FIELD_TYPE, INDEX_TYPE_PAGE, ft ) );
  254.         doc.add( new Field( SearchItem.FIELD_ROLE, page.getRole( ), ft ) );

  255.         // return the document
  256.         return doc;
  257.     }

  258.     /**
  259.      * {@inheritDoc}
  260.      */
  261.     @Override
  262.     public List<String> getListType( )
  263.     {
  264.         List<String> listType = new ArrayList<>( );
  265.         listType.add( INDEX_TYPE_PAGE );

  266.         return listType;
  267.     }

  268.     /**
  269.      * {@inheritDoc}
  270.      */
  271.     @Override
  272.     public String getSpecificSearchAppUrl( )
  273.     {
  274.         return AppPropertiesService.getProperty( PROPERTY_SEARCH_PAGE_URL );
  275.     }
  276. }