View Javadoc
1   /*
2    * Copyright (c) 2002-2013, Mairie de Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.plugins.calendar.modules.document.service.search;
35  
36  import fr.paris.lutece.plugins.calendar.business.Event;
37  import fr.paris.lutece.plugins.calendar.business.OccurrenceEvent;
38  import fr.paris.lutece.plugins.calendar.service.AgendaResource;
39  import fr.paris.lutece.plugins.calendar.service.CalendarPlugin;
40  import fr.paris.lutece.plugins.calendar.service.Utils;
41  import fr.paris.lutece.plugins.calendar.web.Constants;
42  import fr.paris.lutece.plugins.document.business.DocumentFilter;
43  import fr.paris.lutece.plugins.document.business.DocumentHome;
44  import fr.paris.lutece.plugins.document.business.attributes.DocumentAttribute;
45  import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexer;
46  import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexerFactory;
47  import fr.paris.lutece.portal.service.content.XPageAppService;
48  import fr.paris.lutece.portal.service.i18n.I18nService;
49  import fr.paris.lutece.portal.service.message.SiteMessageException;
50  import fr.paris.lutece.portal.service.plugin.PluginService;
51  import fr.paris.lutece.portal.service.search.IndexationService;
52  import fr.paris.lutece.portal.service.search.SearchIndexer;
53  import fr.paris.lutece.portal.service.search.SearchItem;
54  import fr.paris.lutece.portal.service.spring.SpringContextService;
55  import fr.paris.lutece.portal.service.util.AppException;
56  import fr.paris.lutece.portal.service.util.AppLogService;
57  import fr.paris.lutece.portal.service.util.AppPathService;
58  import fr.paris.lutece.portal.service.util.AppPropertiesService;
59  import fr.paris.lutece.util.url.UrlItem;
60  
61  import java.io.ByteArrayInputStream;
62  import java.io.IOException;
63  import java.util.ArrayList;
64  import java.util.Collections;
65  import java.util.List;
66  
67  import org.apache.commons.lang.StringUtils;
68  import org.apache.lucene.document.Document;
69  import org.apache.lucene.document.Field;
70  import org.apache.lucene.document.FieldType;
71  import org.apache.lucene.document.StringField;
72  import org.apache.lucene.document.TextField;
73  import org.apache.lucene.index.IndexOptions;
74  import org.apache.tika.exception.TikaException;
75  import org.apache.tika.metadata.Metadata;
76  import org.apache.tika.parser.ParseContext;
77  import org.apache.tika.parser.html.HtmlParser;
78  import org.apache.tika.sax.BodyContentHandler;
79  import org.xml.sax.ContentHandler;
80  import org.xml.sax.SAXException;
81  
82  
83  public class DocumentCalendarIndexer implements SearchIndexer
84  {
85      //properties
86      private static final String PROPERTY_INDEXER_NAME = "calendar-document.indexer.name";
87      private static final String ENABLE_VALUE_TRUE = "1";
88      private static final String PROPERTY_INDEXER_DESCRIPTION = "calendar-document.indexer.description";
89      private static final String PROPERTY_INDEXER_VERSION = "calendar-document.indexer.version";
90      private static final String PROPERTY_INDEXER_ENABLE = "calendar-document.indexer.enable";
91      private static final String PROPERTY_DOCUMENT_CALENDAR_TYPE = "calendar-document.calendar.document.type";
92      private static final String PROPERTY_DOCUMENT_SHORT_NAME = "dcld";
93      private static final String CALENDAR_SHORT_NAME = "cld";
94      /** uses calendar search page */
95      private static final String JSP_SEARCH_CALENDAR = "jsp/site/Portal.jsp?page=calendar&action=search";
96      private static IFileIndexerFactory _factoryIndexer = (IFileIndexerFactory) SpringContextService
97              .getBean( IFileIndexerFactory.BEAN_FILE_INDEXER_FACTORY );
98  
99      /**
100      * Index all documents
101      * 
102      * @throws IOException the exception
103      * @throws InterruptedException the exception
104      * @throws SiteMessageException the exception
105      */
106     public void indexDocuments( ) throws IOException, InterruptedException, SiteMessageException
107     {
108         String sRoleKey = "";
109 
110         DocumentFilter docFilter = new DocumentFilter( );
111         docFilter.setCodeDocumentType( AppPropertiesService.getProperty( PROPERTY_DOCUMENT_CALENDAR_TYPE ) );
112 
113         for ( fr.paris.lutece.plugins.document.business.Document document : DocumentHome.findByFilter( docFilter,
114                 I18nService.getDefaultLocale( ) ) )
115         {
116             for ( AgendaResource agenda : Utils.getAgendaResourcesWithOccurrences( ) )
117             {
118                 sRoleKey = agenda.getRole( );
119 
120                 String strAgenda = agenda.getId( );
121 
122                 for ( Object oOccurrence : agenda.getAgenda( ).getEvents( ) )
123                 {
124                     OccurrenceEvent occurrence = (OccurrenceEvent) oOccurrence;
125 
126                     if ( occurrence.getDocumentId( ) == document.getId( ) )
127                     {
128                         indexSubject( document, sRoleKey, occurrence, strAgenda );
129                     }
130                 }
131             }
132         }
133     }
134 
135     /**
136      * Recursive method for indexing a calendar event
137      * 
138      * @param faq the faq linked to the subject
139      * @param subject the subject
140      * @throws IOException I/O Exception
141      * @throws InterruptedException interruptedException
142      */
143     public void indexSubject( fr.paris.lutece.plugins.document.business.Document document, String sRoleKey,
144             OccurrenceEvent occurrence, String strAgenda ) throws IOException, InterruptedException
145     {
146         String strPortalUrl = AppPathService.getPortalUrl( );
147 
148         UrlItem urlEvent = new UrlItem( strPortalUrl );
149         urlEvent.addParameter( XPageAppService.PARAM_XPAGE_APP, CalendarPlugin.PLUGIN_NAME );
150         urlEvent.addParameter( Constants.PARAMETER_ACTION, Constants.ACTION_SHOW_RESULT );
151         urlEvent.addParameter( Constants.PARAMETER_EVENT_ID, occurrence.getEventId( ) );
152         urlEvent.addParameter( Constants.PARAMETER_DOCUMENT_ID, document.getId( ) );
153         urlEvent.addParameter( Constants.PARAM_AGENDA, strAgenda );
154 
155         org.apache.lucene.document.Document docSubject = null;
156         try
157         {
158             docSubject = getDocument( document, sRoleKey, occurrence, strAgenda, urlEvent.getUrl( ) );
159         }
160         catch ( Exception e )
161         {
162             String strMessage = "Document ID : " + document.getId( ) + " - Agenda ID : " + strAgenda
163                     + " - Occurrence ID " + occurrence.getId( );
164             IndexationService.error( this, e, strMessage );
165         }
166         if ( docSubject != null )
167         {
168             IndexationService.write( docSubject );
169         }
170     }
171 
172     /**
173      * Get the calendar document
174      * @param strDocument id of the subject to index
175      * @return The list of lucene documents
176      * @throws IOException the exception
177      * @throws InterruptedException the exception
178      * @throws SiteMessageException the exception
179      */
180     public List<Document> getDocuments( String strDocument ) throws IOException, InterruptedException,
181             SiteMessageException
182     {
183         List<org.apache.lucene.document.Document> listDocs = new ArrayList<org.apache.lucene.document.Document>( );
184         String sRoleKey = "";
185         DocumentFilter docFilter = new DocumentFilter( );
186         docFilter.setCodeDocumentType( AppPropertiesService.getProperty( PROPERTY_DOCUMENT_CALENDAR_TYPE ) );
187 
188         for ( fr.paris.lutece.plugins.document.business.Document document : DocumentHome.findByFilter( docFilter,
189                 I18nService.getDefaultLocale( ) ) )
190         {
191             for ( AgendaResource agenda : Utils.getAgendaResourcesWithOccurrences( ) )
192             {
193                 sRoleKey = agenda.getRole( );
194 
195                 String strAgenda = agenda.getId( );
196 
197                 for ( Object oOccurrence : agenda.getAgenda( ).getEvents( ) )
198                 {
199                     OccurrenceEvent occurrence = (OccurrenceEvent) oOccurrence;
200 
201                     if ( occurrence.getDocumentId( ) == document.getId( ) )
202                     {
203                         String strPortalUrl = AppPathService.getPortalUrl( );
204 
205                         UrlItem urlEvent = new UrlItem( strPortalUrl );
206                         urlEvent.addParameter( XPageAppService.PARAM_XPAGE_APP, CalendarPlugin.PLUGIN_NAME );
207                         urlEvent.addParameter( Constants.PARAMETER_ACTION, Constants.ACTION_SHOW_RESULT );
208                         urlEvent.addParameter( Constants.PARAMETER_EVENT_ID, occurrence.getEventId( ) );
209                         urlEvent.addParameter( Constants.PARAMETER_DOCUMENT_ID, document.getId( ) );
210                         urlEvent.addParameter( Constants.PARAM_AGENDA, strAgenda );
211 
212                         org.apache.lucene.document.Document doc = getDocument( document, sRoleKey, occurrence,
213                                 strAgenda, urlEvent.getUrl( ) );
214                         listDocs.add( doc );
215                         ;
216                     }
217                 }
218             }
219         }
220 
221         return listDocs;
222     }
223 
224     /**
225      * Returns the indexer service name
226      * @return the indexer service name
227      */
228     public String getName( )
229     {
230         return AppPropertiesService.getProperty( PROPERTY_INDEXER_NAME );
231     }
232 
233     /**
234      * Returns the indexer service version
235      * @return the indexer service version
236      */
237     public String getVersion( )
238     {
239         return AppPropertiesService.getProperty( PROPERTY_INDEXER_VERSION );
240     }
241 
242     /**
243      * Returns the indexer service description
244      * @return the indexer service description
245      */
246     public String getDescription( )
247     {
248         return AppPropertiesService.getProperty( PROPERTY_INDEXER_DESCRIPTION );
249     }
250 
251     /**
252      * Tells whether the service is enable or not
253      * @return true if enable, otherwise false
254      */
255     public boolean isEnable( )
256     {
257         boolean bReturn = false;
258         String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE );
259 
260         if ( ( strEnable != null )
261                 && ( strEnable.equalsIgnoreCase( Boolean.TRUE.toString( ) ) || strEnable.equals( ENABLE_VALUE_TRUE ) )
262                 && PluginService.isPluginEnable( CalendarPlugin.PLUGIN_NAME ) )
263         {
264             bReturn = true;
265         }
266 
267         return bReturn;
268     }
269 
270     /**
271      * Builds a document which will be used by Lucene during the indexing of the
272      * pages of the site with the following
273      * fields : summary, uid, url, contents, title and description.
274      * 
275      * @param document the document to index
276      * @param strUrl the url of the documents
277      * @param strRole the lutece role of the page associate to the document
278      * @param strPortletDocumentId the document id concatened to the id portlet
279      *            with a & in the middle
280      * @return the built Document
281      * @throws IOException The IO Exception
282      * @throws InterruptedException The InterruptedException
283      */
284     public static org.apache.lucene.document.Document getDocument(
285             fr.paris.lutece.plugins.document.business.Document document, String strRole, Event occurrence,
286             String strAgenda, String strOccurrenceUrl ) throws IOException, InterruptedException
287     {
288         FieldType ft = new FieldType( StringField.TYPE_STORED );
289         ft.setOmitNorms( false );
290 
291         FieldType ftNotStored = new FieldType( StringField.TYPE_STORED );
292         ftNotStored.setOmitNorms( false );
293 
294         FieldType ftNo = new FieldType( StringField.TYPE_STORED );
295         ftNo.setIndexOptions( IndexOptions.NONE );
296         ftNo.setTokenized( false );
297         ftNo.setOmitNorms( false );
298 
299         // make a new, empty document
300         org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document( );
301 
302         doc.add( new Field( Constants.FIELD_CALENDAR_ID, strAgenda + "_" + CALENDAR_SHORT_NAME, ftNotStored ) );
303 
304         // Add the last modified date of the file a field named "modified".
305         // Use a field that is indexed (i.e. searchable), but don't tokenize
306         // the field into words.
307         String strDate = Utils.getDate( occurrence.getDate( ) );
308         doc.add( new Field( SearchItem.FIELD_DATE, strDate, ft ) );
309 
310         // Add the url as a field named "url".  Use an UnIndexed field, so
311         // that the url is just stored with the question/answer, but is not searchable.
312         doc.add( new Field( SearchItem.FIELD_URL, strOccurrenceUrl, ft ) );
313 
314         // Add the uid as a field, so that index can be incrementally maintained.
315         // This field is not stored with document, it is indexed, but it is not
316         // tokenized prior to indexing.
317         String strOccurrenceId = String.valueOf( occurrence.getId( ) );
318         doc.add( new Field( SearchItem.FIELD_UID, strOccurrenceId + "_" + PROPERTY_DOCUMENT_SHORT_NAME, ft ) );
319 
320         String strContentToIndex = getContentToIndex( document );
321         ContentHandler handler = new BodyContentHandler( );
322         Metadata metadata = new Metadata( );
323         try
324         {
325             new HtmlParser( ).parse( new ByteArrayInputStream( strContentToIndex.getBytes( ) ), handler, metadata,
326                     new ParseContext( ) );
327         }
328         catch ( SAXException e )
329         {
330             throw new AppException( "Error during page parsing." );
331         }
332         catch ( TikaException e )
333         {
334             throw new AppException( "Error during page parsing." );
335         }
336 
337         //the content of the article is recovered in the parser because this one
338         //had replaced the encoded caracters (as &eacute;) by the corresponding special caracter (as ?)
339         StringBuilder sb = new StringBuilder( handler.toString( ) );
340 
341         // Add the tag-stripped contents as a Reader-valued Text field so it will
342         // get tokenized and indexed.
343         doc.add( new Field( SearchItem.FIELD_CONTENTS, sb.toString( ), TextField.TYPE_NOT_STORED ) );
344 
345         // Add the title as a separate Text field, so that it can be searched
346         // separately.
347         doc.add( new Field( SearchItem.FIELD_TITLE, document.getTitle( ), ftNo ) );
348 
349         doc.add( new Field( SearchItem.FIELD_TYPE, CalendarPlugin.PLUGIN_NAME, TextField.TYPE_STORED ) );
350 
351         doc.add( new Field( SearchItem.FIELD_ROLE, strRole, ft ) );
352 
353         // return the document
354         return doc;
355     }
356 
357     /**
358      * Get the content from the document
359      * @param document the document to index
360      * @return the content
361      */
362     private static String getContentToIndex( fr.paris.lutece.plugins.document.business.Document document )
363     {
364         StringBuffer sbContentToIndex = new StringBuffer( );
365         sbContentToIndex.append( document.getTitle( ) );
366 
367         for ( DocumentAttribute attribute : document.getAttributes( ) )
368         {
369             if ( attribute.isSearchable( ) )
370             {
371                 if ( !attribute.isBinary( ) )
372                 {
373                     // Text attributes
374                     sbContentToIndex.append( attribute.getTextValue( ) );
375                     sbContentToIndex.append( " " );
376                 }
377                 else
378                 {
379                     // Binary file attribute
380                     // Gets indexer depending on the ContentType (ie: "application/pdf" should use a PDF indexer)
381                     IFileIndexer indexer = _factoryIndexer.getIndexer( attribute.getValueContentType( ) );
382 
383                     if ( indexer != null )
384                     {
385                         try
386                         {
387                             ByteArrayInputStream bais = new ByteArrayInputStream( attribute.getBinaryValue( ) );
388                             sbContentToIndex.append( indexer.getContentToIndex( bais ) );
389                             sbContentToIndex.append( " " );
390                             bais.close( );
391                         }
392                         catch ( IOException e )
393                         {
394                             AppLogService.error( e.getMessage( ), e );
395                         }
396                     }
397                 }
398             }
399         }
400 
401         // Index Metadata
402         sbContentToIndex.append( document.getXmlMetadata( ) );
403 
404         return sbContentToIndex.toString( );
405     }
406 
407     /**
408      * Defined by Calendar indexer.
409      */
410     public List<String> getListType( )
411     {
412         return Collections.emptyList( );
413     }
414 
415     /**
416      * Defined by Calendar indexer.
417      */
418     public String getSpecificSearchAppUrl( )
419     {
420         return StringUtils.EMPTY;
421     }
422 }