View Javadoc
1   /*
2    * Copyright (c) 2002-2021, City of Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.plugins.blog.service.docsearch;
35  
36  import java.io.ByteArrayInputStream;
37  import java.io.IOException;
38  import java.util.ArrayList;
39  import java.util.Date;
40  import java.util.List;
41  
42  import org.apache.lucene.document.DateTools;
43  import org.apache.lucene.document.Field;
44  import org.apache.lucene.document.FieldType;
45  import org.apache.lucene.document.NumericDocValuesField;
46  import org.apache.lucene.document.StringField;
47  import org.apache.lucene.document.TextField;
48  import org.apache.lucene.index.CorruptIndexException;
49  import org.apache.lucene.index.IndexWriter;
50  import org.apache.lucene.index.Term;
51  import org.apache.tika.exception.TikaException;
52  import org.apache.tika.metadata.Metadata;
53  import org.apache.tika.parser.ParseContext;
54  import org.apache.tika.parser.html.HtmlParser;
55  import org.apache.tika.sax.BodyContentHandler;
56  import org.xml.sax.ContentHandler;
57  import org.xml.sax.SAXException;
58  
59  import fr.paris.lutece.plugins.blog.business.Blog;
60  import fr.paris.lutece.plugins.blog.business.BlogHome;
61  import fr.paris.lutece.plugins.blog.business.IndexerAction;
62  import fr.paris.lutece.plugins.blog.business.Tag;
63  import fr.paris.lutece.plugins.blog.service.BlogPlugin;
64  import fr.paris.lutece.plugins.blog.service.BlogService;
65  import fr.paris.lutece.plugins.blog.utils.BlogUtils;
66  import fr.paris.lutece.portal.service.message.SiteMessageException;
67  import fr.paris.lutece.portal.service.plugin.PluginService;
68  import fr.paris.lutece.portal.service.search.SearchItem;
69  import fr.paris.lutece.portal.service.util.AppException;
70  import fr.paris.lutece.portal.service.util.AppPropertiesService;
71  import org.apache.lucene.document.Document;
72  
73  /**
74   * DefaultAnnounceIndexer
75   */
76  public class DefaultBlogIndexer implements IBlogSearchIndexer
77  {
78      private static final String PROPERTY_INDEXER_NAME = "blog.indexer.name";
79      private static final String ENABLE_VALUE_TRUE = "1";
80      private static final String PROPERTY_INDEXER_DESCRIPTION = "blog.indexer.description";
81      private static final String PROPERTY_INDEXER_VERSION = "blog.indexer.version";
82      private static final String PROPERTY_INDEXER_ENABLE = "blog.indexer.enable";
83      private static final String BLANK_SPACE = " ";
84  
85      /**
86       * {@inheritDoc}
87       */
88      @Override
89      public String getDescription( )
90      {
91          return AppPropertiesService.getProperty( PROPERTY_INDEXER_DESCRIPTION );
92      }
93  
94      /**
95       * Index given list of record
96       * 
97       * @param indexWriter
98       *            the indexWriter
99       * @param listIdBlog
100      *            The list of id blog
101      * @throws CorruptIndexException
102      *             If the index is corrupted
103      * @throws IOException
104      *             If an IO Exception occurred
105      */
106     private void indexListBlog( IndexWriter indexWriter, List<Integer> listIdBlog ) throws IOException
107     {
108         for ( Integer nBlogId : listIdBlog )
109         {
110             Blog blog = BlogService.getInstance( ).findByPrimaryKeyWithoutBinaries( nBlogId );
111             if ( blog != null )
112             {
113                 Document doc = getDocument( blog );
114                 indexWriter.addDocument( doc );
115             }
116         }
117     }
118 
119     /**
120      * {@inheritDoc}
121      */
122     @Override
123     public synchronized void processIndexing( IndexWriter indexWriter, boolean bCreate, StringBuilder sbLogs )
124             throws IOException, InterruptedException, SiteMessageException
125     {
126         List<Integer> listIdBlog = new ArrayList<>( );
127 
128         if ( !bCreate )
129         {
130             // incremental indexing
131             // delete all record which must be deleted
132             for ( fr.paris.lutece.plugins.blog.business.IndexerAction action : BlogSearchService.getInstance( )
133                     .getAllIndexerActionByTask( IndexerAction.TASK_DELETE ) )
134             {
135                 sbLogBlog( sbLogs, action.getIdBlog( ), IndexerAction.TASK_DELETE );
136 
137                 Term term = new Term( BlogSearchItem.FIELD_ID_HTML_DOC, Integer.toString( action.getIdBlog( ) ) );
138                 Term [ ] terms = {
139                         term
140                 };
141 
142                 indexWriter.deleteDocuments( terms );
143                 BlogSearchService.getInstance( ).removeIndexerAction( action.getIdAction( ) );
144             }
145 
146             // Update all record which must be updated
147             for ( IndexerAction action : BlogSearchService.getInstance( ).getAllIndexerActionByTask( IndexerAction.TASK_MODIFY ) )
148             {
149                 sbLogBlog( sbLogs, action.getIdBlog( ), IndexerAction.TASK_MODIFY );
150 
151                 Term term = new Term( BlogSearchItem.FIELD_ID_HTML_DOC, Integer.toString( action.getIdBlog( ) ) );
152                 Term [ ] terms = {
153                         term
154                 };
155 
156                 indexWriter.deleteDocuments( terms );
157                 listIdBlog = new ArrayList<>( );
158                 listIdBlog.add( action.getIdBlog( ) );
159                 this.indexListBlog( indexWriter, listIdBlog );
160                 BlogSearchService.getInstance( ).removeIndexerAction( action.getIdAction( ) );
161             }
162 
163             listIdBlog = new ArrayList<>( );
164 
165             // add all record which must be added
166             for ( IndexerAction action : BlogSearchService.getInstance( ).getAllIndexerActionByTask( IndexerAction.TASK_CREATE ) )
167             {
168                 sbLogBlog( sbLogs, action.getIdBlog( ), IndexerAction.TASK_CREATE );
169                 listIdBlog.add( action.getIdBlog( ) );
170 
171                 BlogSearchService.getInstance( ).removeIndexerAction( action.getIdAction( ) );
172             }
173 
174             indexListBlog( indexWriter, listIdBlog );
175         }
176         else
177         {
178             for ( Blog doc : BlogHome.getBlogsList( ) )
179             {
180 
181                 sbLogs.append( "Indexing Blog" );
182                 sbLogs.append( "\r\n" );
183 
184                 sbLogBlog( sbLogs, doc.getId( ), IndexerAction.TASK_CREATE );
185 
186                 listIdBlog.add( doc.getId( ) );
187 
188             }
189 
190             indexListBlog( indexWriter, listIdBlog );
191         }
192 
193         indexWriter.commit( );
194     }
195 
196     /**
197      * Builds a document which will be used by Lucene during the indexing of the announces list
198      * 
199      * @param blog
200      *            The blog post
201      * @throws IOException
202      *             If an IO Exception occurred
203      * @return the document
204      */
205     public static org.apache.lucene.document.Document getDocument( Blog blog ) throws IOException
206     {
207         // make a new, empty document
208         org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document( );
209 
210         doc.add( new StringField( BlogSearchItem.FIELD_ID_HTML_DOC, Integer.toString( blog.getId( ) ), Field.Store.YES ) );
211         // Add the user firstName as a field, so that index can be incrementally maintained.
212         doc.add( new StringField( BlogSearchItem.FIELD_USER, blog.getUserCreator( ).toLowerCase( ), Field.Store.YES ) );
213 
214         doc.add( new TextField( BlogSearchItem.FIELD_TAGS, getTagToIndex( blog ), Field.Store.YES ) );
215         doc.add( new TextField( BlogSearchItem.FIELD_USERS_EDITED_BLOG, getUsersEditedBlogVersions( blog ), Field.Store.YES ) );
216 
217         FieldType ft = new FieldType( StringField.TYPE_STORED );
218         ft.setOmitNorms( false );
219         doc.add( new Field( SearchItem.FIELD_DATE, DateTools.timeToString( blog.getUpdateDate( ).getTime( ), DateTools.Resolution.MINUTE ), ft ) );
220         doc.add( new NumericDocValuesField( BlogSearchItem.FIELD_DATE_UPDATE, blog.getUpdateDate( ).getTime( ) ) );
221         // is document published TODAY
222         Date today = new Date( );
223         boolean isPublished = blog.getBlogPublication( ).stream( )
224                 .anyMatch( publication -> today.after( publication.getDateBeginPublishing( ) ) && today.before( publication.getDateEndPublishing( ) ) );
225         doc.add( new TextField( BlogSearchItem.FIELD_UNPUBLISHED, ( isPublished ) ? "false" : "true", Field.Store.YES ) );
226 
227         // Add the uid as a field, so that index can be incrementally maintained.
228         // This field is not stored with question/answer, it is indexed, but it is not
229         // tokenized prior to indexing.
230         String strIdAnnounce = String.valueOf( blog.getId( ) );
231         doc.add( new StringField( SearchItem.FIELD_UID, strIdAnnounce, Field.Store.YES ) );
232 
233         String strContentToIndex = getContentToIndex( blog );
234         // NOUVEAU
235         ContentHandler handler = new BodyContentHandler( -1 );
236         Metadata metadata = new Metadata( );
237 
238         try
239         {
240             new HtmlParser( ).parse( new ByteArrayInputStream( strContentToIndex.getBytes( ) ), handler, metadata, new ParseContext( ) );
241         }
242         catch( TikaException | SAXException e )
243         {
244             throw new AppException( "Error during blog parsing. blog Id: " + blog.getId( ), e );
245         }
246 
247         String strContent = handler.toString( );
248 
249         // Add the tag-stripped contents as a Reader-valued Text field so it will
250         // get tokenized and indexed.
251         doc.add( new TextField( SearchItem.FIELD_CONTENTS, strContent, Field.Store.NO ) );
252 
253         doc.add( new TextField( SearchItem.FIELD_SUMMARY, blog.getHtmlContent( ), Field.Store.YES ) );
254         // Add the subject name as a separate Text field, so that it can be searched
255         // separately.
256         doc.add( new StringField( SearchItem.FIELD_TITLE, blog.getName( ), Field.Store.YES ) );
257 
258         doc.add( new StringField( SearchItem.FIELD_TYPE, BlogPlugin.PLUGIN_NAME, Field.Store.YES ) );
259 
260         // return the document
261         return doc;
262     }
263 
264     /**
265      * Set the Content to index
266      * 
267      * @param blog
268      *            The {@link blog} to index
269      * @return The content to index
270      */
271     private static String getContentToIndex( Blog blog )
272     {
273         StringBuilder sbContentToIndex = new StringBuilder( );
274         // Do not index question here
275         sbContentToIndex.append( blog.getName( ) );
276         sbContentToIndex.append( BLANK_SPACE );
277         sbContentToIndex.append( blog.getDescription( ) );
278         sbContentToIndex.append( BLANK_SPACE );
279         sbContentToIndex.append( blog.getHtmlContent( ) );
280         sbContentToIndex.append( BLANK_SPACE );
281         sbContentToIndex.append( blog.getId( ) );
282 
283         return sbContentToIndex.toString( );
284     }
285 
286     /**
287      * Set the tag to index
288      * 
289      * @param blog
290      *            The {@link blog} to index
291      * @return The tag to index
292      */
293     private static String getTagToIndex( Blog blog )
294     {
295         StringBuilder sbContentToIndex = new StringBuilder( );
296 
297         for ( Tag tg : blog.getTag( ) )
298         {
299             sbContentToIndex.append( BLANK_SPACE );
300             sbContentToIndex.append( tg.getIdTag( ) );
301         }
302 
303         return sbContentToIndex.toString( );
304     }
305 
306     /**
307      * Set the user list edited the blog
308      * 
309      * @param blog
310      *            The blog to index
311      * @return The list of users
312      */
313     private static String getUsersEditedBlogVersions( Blog blog )
314     {
315         StringBuilder sbContentToIndex = new StringBuilder( );
316         List<String> usersList = BlogHome.getUsersEditedBlogVersions( blog.getId( ) );
317 
318         for ( String user : usersList )
319         {
320             sbContentToIndex.append( BLANK_SPACE );
321             sbContentToIndex.append( user );
322         }
323 
324         return sbContentToIndex.toString( );
325     }
326 
327     /**
328      * {@inheritDoc}
329      */
330     @Override
331     public String getName( )
332     {
333         return AppPropertiesService.getProperty( PROPERTY_INDEXER_NAME );
334     }
335 
336     /**
337      * {@inheritDoc}
338      */
339     @Override
340     public String getVersion( )
341     {
342         return AppPropertiesService.getProperty( PROPERTY_INDEXER_VERSION );
343     }
344 
345     /**
346      * {@inheritDoc}
347      */
348     @Override
349     public boolean isEnable( )
350     {
351         boolean bReturn = false;
352         String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE );
353 
354         if ( ( strEnable != null ) && ( strEnable.equalsIgnoreCase( Boolean.TRUE.toString( ) ) || strEnable.equals( ENABLE_VALUE_TRUE ) )
355                 && PluginService.isPluginEnable( BlogPlugin.PLUGIN_NAME ) )
356         {
357             bReturn = true;
358         }
359 
360         return bReturn;
361     }
362 
363     /**
364      * Indexing action performed on the recording
365      * 
366      * @param sbLogs
367      *            the buffer log
368      * @param nIdBlog
369      *            the id of the Blog
370      * @param nAction
371      *            the indexer action key performed
372      */
373     private void sbLogBlog( StringBuilder sbLogs, int nIdBlog, int nAction )
374     {
375         sbLogs.append( "Indexing Blogs:" );
376 
377         switch( nAction )
378         {
379             case IndexerAction.TASK_CREATE:
380                 sbLogs.append( "Insert " );
381 
382                 break;
383 
384             case IndexerAction.TASK_MODIFY:
385                 sbLogs.append( "Modify " );
386 
387                 break;
388 
389             case IndexerAction.TASK_DELETE:
390                 sbLogs.append( "Delete " );
391 
392                 break;
393 
394             default:
395                 break;
396         }
397 
398         if ( nIdBlog != BlogUtils.CONSTANT_ID_NULL )
399         {
400             sbLogs.append( "id_blog=" );
401             sbLogs.append( nIdBlog );
402         }
403 
404         sbLogs.append( "\r\n" );
405     }
406 
407 }