View Javadoc
1   /*
2    * Copyright (c) 2002-2022, City of Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.portal.service.search;
35  
36  import fr.paris.lutece.portal.business.page.Page;
37  import fr.paris.lutece.portal.business.page.PageHome;
38  import fr.paris.lutece.portal.service.message.SiteMessageException;
39  import fr.paris.lutece.portal.service.page.IPageService;
40  import fr.paris.lutece.portal.service.spring.SpringContextService;
41  import fr.paris.lutece.portal.service.util.AppException;
42  import fr.paris.lutece.portal.service.util.AppPropertiesService;
43  import fr.paris.lutece.util.url.UrlItem;
44  import org.apache.lucene.index.IndexOptions;
45  
46  import org.apache.commons.lang3.StringUtils;
47  
48  import org.apache.lucene.document.DateTools;
49  import org.apache.lucene.document.Document;
50  import org.apache.lucene.document.Field;
51  import org.apache.lucene.document.FieldType;
52  import org.apache.lucene.document.StoredField;
53  import org.apache.lucene.document.StringField;
54  import org.apache.lucene.document.TextField;
55  
56  import org.apache.tika.exception.TikaException;
57  import org.apache.tika.metadata.Metadata;
58  import org.apache.tika.parser.ParseContext;
59  import org.apache.tika.parser.html.HtmlParser;
60  import org.apache.tika.sax.BodyContentHandler;
61  
62  import org.xml.sax.ContentHandler;
63  import org.xml.sax.SAXException;
64  
65  import java.io.ByteArrayInputStream;
66  import java.io.IOException;
67  
68  import java.util.ArrayList;
69  import java.util.List;
70  
71  /**
72   * Indexer service for pages
73   */
74  public class PageIndexer implements SearchIndexer
75  {
76      public static final String INDEX_TYPE_PAGE = "Page";
77      public static final String INDEXER_NAME = "PageIndexer";
78      protected static final String PROPERTY_PAGE_BASE_URL = "search.pageIndexer.baseUrl";
79      protected static final String PROPERTY_SEARCH_PAGE_URL = "search.pageSearch.baseUrl";
80      protected static final String PROPERTY_INDEXER_ENABLE = "search.pageIndexer.enable";
81      protected static final String PARAMETER_PAGE_ID = "page_id";
82      private static IPageService _pageService = SpringContextService.getBean( "pageService" );
83      private static final String INDEXER_DESCRIPTION = "Indexer service for pages";
84      private static final String INDEXER_VERSION = "1.0.0";
85  
86      /**
87       * {@inheritDoc}
88       */
89      @Override
90      public void indexDocuments( ) throws IOException, InterruptedException, SiteMessageException
91      {
92          String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
93          List<Page> listPages = PageHome.getAllPages( );
94  
95          for ( Page page : listPages )
96          {
97              UrlItem/url/UrlItem.html#UrlItem">UrlItem url = new UrlItem( strPageBaseUrl );
98              url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
99  
100             Document doc = null;
101 
102             try
103             {
104                 doc = getDocument( page, url.getUrl( ) );
105             }
106             catch( Exception e )
107             {
108                 String strMessage = "Page ID : " + page.getId( );
109                 IndexationService.error( this, e, strMessage );
110             }
111 
112             if ( doc != null )
113             {
114                 IndexationService.write( doc );
115             }
116         }
117     }
118 
119     /**
120      * {@inheritDoc}
121      */
122     @Override
123     public List<Document> getDocuments( String nIdDocument ) throws IOException, InterruptedException, SiteMessageException
124     {
125         ArrayList<Document> listDocuments = new ArrayList<>( );
126         String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
127 
128         Page page = PageHome.getPage( Integer.parseInt( nIdDocument ) );
129 
130         if ( ( page != null ) && ( page.getId( ) != 0 ) )
131         {
132             UrlItem/url/UrlItem.html#UrlItem">UrlItem url = new UrlItem( strPageBaseUrl );
133             url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
134 
135             Document doc = getDocument( page, url.getUrl( ) );
136             listDocuments.add( doc );
137         }
138 
139         return listDocuments;
140     }
141 
142     /**
143      * {@inheritDoc}
144      */
145     @Override
146     public String getName( )
147     {
148         return INDEXER_NAME;
149     }
150 
151     /**
152      * {@inheritDoc}
153      */
154     @Override
155     public String getVersion( )
156     {
157         return INDEXER_VERSION;
158     }
159 
160     /**
161      * {@inheritDoc}
162      */
163     @Override
164     public String getDescription( )
165     {
166         return INDEXER_DESCRIPTION;
167     }
168 
169     /**
170      * {@inheritDoc}
171      */
172     @Override
173     public boolean isEnable( )
174     {
175         String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE, Boolean.TRUE.toString( ) );
176 
177         return ( strEnable.equalsIgnoreCase( Boolean.TRUE.toString( ) ) );
178     }
179 
180     /**
181      * Builds a document which will be used by Lucene during the indexing of the pages of the site with the following fields : summary, uid, url, contents,
182      * title and description.
183      * 
184      * @return the built Document
185      * @param strUrl
186      *            The base URL for documents
187      * @param page
188      *            the page to index
189      * @throws IOException
190      *             The IO Exception
191      * @throws InterruptedException
192      *             The InterruptedException
193      * @throws SiteMessageException
194      *             occurs when a site message need to be displayed
195      */
196     protected Document getDocument( Page page, String strUrl ) throws IOException, InterruptedException, SiteMessageException
197     {
198         FieldType ft = new FieldType( StringField.TYPE_STORED );
199         ft.setOmitNorms( false );
200 
201         FieldType ftNotStored = new FieldType( StringField.TYPE_NOT_STORED );
202         ftNotStored.setOmitNorms( false );
203         ftNotStored.setTokenized( false );
204 
205         FieldType ftDate = new FieldType( StringField.TYPE_STORED );
206         ftDate.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
207         ftDate.setStored( true );
208         ftDate.setOmitNorms( false );
209 
210         FieldType ftUid = ftNotStored;
211         ftUid.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
212         // make a new, empty document
213         Document doc = new Document( );
214 
215         // Add the url as a field named "url". Use an UnIndexed field, so
216         // that the url is just stored with the document, but is not searchable.
217         doc.add( new StoredField( SearchItem.FIELD_URL, strUrl ) );
218 
219         // Add the last modified date of the file a field named "modified".
220         // Use a field that is indexed (i.e. searchable), but don't tokenize
221         // the field into words.
222         String strDate = DateTools.dateToString( page.getDateUpdate( ), DateTools.Resolution.DAY );
223         doc.add( new Field( SearchItem.FIELD_DATE, strDate, ftDate ) );
224 
225         // Add the uid as a field, so that index can be incrementally maintained.
226         // This field is not stored with document, it is indexed, but it is not
227         // tokenized prior to indexing.
228         String strIdPage = String.valueOf( page.getId( ) );
229         doc.add( new Field( SearchItem.FIELD_UID, strIdPage, ftUid ) );
230 
231         String strPageContent = _pageService.getPageContent( page.getId( ), 0, null );
232         ContentHandler handler = new BodyContentHandler( );
233         Metadata metadata = new Metadata( );
234 
235         try
236         {
237             new HtmlParser( ).parse( new ByteArrayInputStream( strPageContent.getBytes( ) ), handler, metadata, new ParseContext( ) );
238         }
239         catch( TikaException | SAXException e )
240         {
241             throw new AppException( "Error during page parsing." );
242         }
243 
244         // the content of the article is recovered in the parser because this one
245         // had replaced the encoded caracters (as &eacute;) by the corresponding special caracter (as ?)
246         StringBuilder sb = new StringBuilder( handler.toString( ) );
247 
248         // Add the tag-stripped contents as a Reader-valued Text field so it will
249         // get tokenized and indexed.
250         StringBuilder sbFieldContent = new StringBuilder( );
251         StringBuilder sbFieldMetadata = new StringBuilder( );
252         sbFieldContent.append( page.getName( ) ).append( " " ).append( sb.toString( ) );
253 
254         // Add the metadata description of the page if it exists
255         if ( page.getDescription( ) != null )
256         {
257             sbFieldContent.append( " " ).append( page.getDescription( ) );
258         }
259 
260         // Add the metadata keywords of the page if it exists
261         String strMetaKeywords = page.getMetaKeywords( );
262 
263         if ( StringUtils.isNotBlank( strMetaKeywords ) )
264         {
265             sbFieldContent.append( " " ).append( strMetaKeywords );
266             sbFieldMetadata.append( strMetaKeywords );
267         }
268 
269         doc.add( new Field( SearchItem.FIELD_CONTENTS, sbFieldContent.toString( ), TextField.TYPE_NOT_STORED ) );
270 
271         if ( StringUtils.isNotBlank( page.getMetaDescription( ) ) )
272         {
273             if ( sbFieldMetadata.length( ) > 0 )
274             {
275                 sbFieldMetadata.append( " " );
276             }
277 
278             sbFieldMetadata.append( page.getMetaDescription( ) );
279         }
280 
281         if ( sbFieldMetadata.length( ) > 0 )
282         {
283             doc.add( new Field( SearchItem.FIELD_METADATA, sbFieldMetadata.toString( ), TextField.TYPE_NOT_STORED ) );
284         }
285 
286         // Add the title as a separate Text field, so that it can be searched
287         // separately.
288         doc.add( new Field( SearchItem.FIELD_TITLE, page.getName( ), ft ) );
289 
290         if ( StringUtils.isNotBlank( page.getDescription( ) ) )
291         {
292             // Add the summary as an UnIndexed field, so that it is stored and returned
293             // with hit documents for display.
294             doc.add( new StoredField( SearchItem.FIELD_SUMMARY, page.getDescription( ) ) );
295         }
296 
297         doc.add( new Field( SearchItem.FIELD_TYPE, INDEX_TYPE_PAGE, ft ) );
298         doc.add( new Field( SearchItem.FIELD_ROLE, page.getRole( ), ft ) );
299 
300         // return the document
301         return doc;
302     }
303 
304     /**
305      * {@inheritDoc}
306      */
307     @Override
308     public List<String> getListType( )
309     {
310         List<String> listType = new ArrayList<>( );
311         listType.add( INDEX_TYPE_PAGE );
312 
313         return listType;
314     }
315 
316     /**
317      * {@inheritDoc}
318      */
319     @Override
320     public String getSpecificSearchAppUrl( )
321     {
322         return AppPropertiesService.getProperty( PROPERTY_SEARCH_PAGE_URL );
323     }
324 }