View Javadoc
1   /*
2    * Copyright (c) 2002-2020, City of Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.plugins.document.service.search;
35  
36  import fr.paris.lutece.plugins.document.business.Document;
37  import fr.paris.lutece.plugins.document.business.DocumentHome;
38  import fr.paris.lutece.plugins.document.business.DocumentTypeHome;
39  import fr.paris.lutece.plugins.document.business.attributes.DocumentAttribute;
40  import fr.paris.lutece.plugins.document.business.portlet.DocumentListPortletHome;
41  import fr.paris.lutece.plugins.document.service.publishing.PublishingService;
42  import fr.paris.lutece.plugins.document.utils.IntegerUtils;
43  import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexer;
44  import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexerFactory;
45  import fr.paris.lutece.portal.business.page.Page;
46  import fr.paris.lutece.portal.business.page.PageHome;
47  import fr.paris.lutece.portal.business.portlet.Portlet;
48  import fr.paris.lutece.portal.business.portlet.PortletHome;
49  import fr.paris.lutece.portal.service.search.IndexationService;
50  import fr.paris.lutece.portal.service.search.SearchIndexer;
51  import fr.paris.lutece.portal.service.search.SearchItem;
52  import fr.paris.lutece.portal.service.spring.SpringContextService;
53  import fr.paris.lutece.portal.service.util.AppException;
54  import fr.paris.lutece.portal.service.util.AppLogService;
55  import fr.paris.lutece.portal.service.util.AppPropertiesService;
56  import fr.paris.lutece.util.ReferenceItem;
57  import fr.paris.lutece.util.url.UrlItem;
58  
59  import org.apache.commons.lang3.StringUtils;
60  
61  import org.apache.lucene.document.DateTools;
62  import org.apache.lucene.document.Field;
63  import org.apache.lucene.document.FieldType;
64  import org.apache.lucene.document.StoredField;
65  import org.apache.lucene.document.StringField;
66  import org.apache.lucene.document.TextField;
67  
68  import org.apache.tika.exception.TikaException;
69  import org.apache.tika.metadata.Metadata;
70  import org.apache.tika.parser.ParseContext;
71  import org.apache.tika.parser.html.HtmlParser;
72  import org.apache.tika.sax.BodyContentHandler;
73  
74  import org.xml.sax.ContentHandler;
75  import org.xml.sax.SAXException;
76  
77  import java.io.ByteArrayInputStream;
78  import java.io.IOException;
79  
80  import java.util.ArrayList;
81  import java.util.Iterator;
82  import java.util.List;
83  
84  
85  /**
86   * Document Indexer
87   */
88  public class DocumentIndexer implements SearchIndexer
89  {
90      public static final String INDEXER_NAME = "DocumentIndexer";
91      public static final String SHORT_NAME = "dcm";
92      private static final String INDEXER_DESCRIPTION = "Indexer service for documents";
93      private static final String INDEXER_VERSION = "1.0.0";
94      private static final String PROPERTY_PAGE_BASE_URL = "document.documentIndexer.baseUrl";
95      private static final String PROPERTY_INDEXER_ENABLE = "document.documentIndexer.enable";
96      private static final String PARAMETER_DOCUMENT_ID = "document_id";
97      private static final String PARAMETER_PORTLET_ID = "portlet_id";
98      private static final String JSP_PAGE_ADVANCED_SEARCH = "jsp/site/Portal.jsp?page=advanced_search";
99      private static final String PROPERTY_WRITER_MAX_FIELD_LENGTH = "search.lucene.writer.maxFieldLength"; // from the core
100     private static final int DEFAULT_WRITER_MAX_FIELD_LENGTH = 1000000;
101 
102 
103     /**
104      * index all lucene documents
105      * @throws java.io.IOException i/o exception
106      * @throws java.lang.InterruptedException interrupted exception
107      */
108     @Override
109     public void indexDocuments(  ) throws IOException, InterruptedException
110     {
111         String strBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
112         Page page;
113 
114         for ( Portlet portlet : PortletHome.findByType( DocumentListPortletHome.getInstance(  ).getPortletTypeId(  ) ) )
115         {
116             page = PageHome.getPage( portlet.getPageId(  ) );
117 
118             for ( Document d : PublishingService.getInstance(  ).getPublishedDocumentsByPortletId( portlet.getId(  ) ) )
119             {
120                 Document document = DocumentHome.findByPrimaryKey( d.getId(  ) );
121 
122                 // Reload the full object to get all its searchable attributes
123                 UrlItem url = new UrlItem( strBaseUrl );
124                 url.addParameter( PARAMETER_DOCUMENT_ID, document.getId(  ) );
125                 url.addParameter( PARAMETER_PORTLET_ID, portlet.getId(  ) );
126 
127                 String strPortletDocumentId = document.getId(  ) + "_" + SHORT_NAME + "&" + portlet.getId(  );
128                 org.apache.lucene.document.Document doc = null;
129 
130                 try
131                 {
132                     doc = getDocument( document, url.getUrl(  ), page.getRole(  ), strPortletDocumentId );
133                 }
134                 catch ( Exception e )
135                 {
136                     String strMessage = "Document ID : " + document.getId(  ) + " - Portlet ID : " + portlet.getId(  );
137                     IndexationService.error( this, e, strMessage );
138                 }
139 
140                 if ( doc != null )
141                 {
142                     IndexationService.write( doc );
143                 }
144             }
145         }
146     }
147 
148     /**
149      * Returns a collection of lucene documents with the same id
150      * @param strIdDocument the document id
151      * @return lucene documents
152      * @throws IOexception i/o exception
153      * @throws InterruptedException interrupted exception
154      */
155     @Override
156     public List<org.apache.lucene.document.Document> getDocuments( String strIdDocument )
157         throws IOException, InterruptedException
158     {
159         List<org.apache.lucene.document.Document> listDocs = new ArrayList<org.apache.lucene.document.Document>(  );
160         int nIdDocument = IntegerUtils.convert( strIdDocument );
161         Document document = DocumentHome.findByPrimaryKey( nIdDocument );
162         Iterator<Portlet> it = PublishingService.getInstance(  ).getPortletsByDocumentId( strIdDocument ).iterator(  );
163         String strBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
164         Page page;
165 
166         while ( it.hasNext(  ) )
167         {
168             Portlet portlet = it.next(  );
169             UrlItem url = new UrlItem( strBaseUrl );
170             url.addParameter( PARAMETER_DOCUMENT_ID, nIdDocument );
171             url.addParameter( PARAMETER_PORTLET_ID, portlet.getId(  ) );
172 
173             String strPortletDocumentId = nIdDocument + "_" + SHORT_NAME + "&" + portlet.getId(  );
174 
175             page = PageHome.getPage( portlet.getPageId(  ) );
176 
177             org.apache.lucene.document.Document doc = getDocument( document, url.getUrl(  ), page.getRole(  ),
178                     strPortletDocumentId );
179             listDocs.add( doc );
180         }
181 
182         return listDocs;
183     }
184 
185     /**
186      * Returns the indexer service name
187      * @return the indexer service name
188      */
189     @Override
190     public String getName(  )
191     {
192         return INDEXER_NAME;
193     }
194 
195     /**
196      * Returns the indexer service version
197      * @return The indexer service version
198      */
199     @Override
200     public String getVersion(  )
201     {
202         return INDEXER_VERSION;
203     }
204 
205     /**
206      * Returns the indexer service description
207      * @return The indexer service description
208      */
209     @Override
210     public String getDescription(  )
211     {
212         return INDEXER_DESCRIPTION;
213     }
214 
215     /**
216      * Tells whether the service is enable or not
217      * @return true if enable, otherwise false
218      */
219     @Override
220     public boolean isEnable(  )
221     {
222         String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE, "true" );
223 
224         return ( strEnable.equalsIgnoreCase( "true" ) );
225     }
226 
227     /**
228      * Builds a document which will be used by Lucene during the indexing of the
229      * pages of the site with the following
230      * fields : summary, uid, url, contents, title and description.
231      *
232      * @param document the document to index
233      * @param strUrl the url of the documents
234      * @param strRole the lutece role of the page associate to the document
235      * @param strPortletDocumentId the document id concatened to the id portlet
236      *            with a & in the middle
237      * @return the built Document
238      * @throws IOException The IO Exception
239      * @throws InterruptedException The InterruptedException
240      */
241     public static org.apache.lucene.document.Document getDocument( Document document, String strUrl, String strRole,
242         String strPortletDocumentId ) throws IOException, InterruptedException
243     {
244         // make a new, empty document
245         org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(  );
246 
247         FieldType ft = new FieldType( StringField.TYPE_STORED );
248         ft.setOmitNorms( false );
249 
250         // Add the url as a field named "url".  Use an UnIndexed field, so
251         // that the url is just stored with the document, but is not searchable.
252         doc.add( new Field( SearchItem.FIELD_URL, strUrl, ft ) );
253 
254         // Add the PortletDocumentId as a field named "document_portlet_id".  
255         doc.add( new Field( SearchItem.FIELD_DOCUMENT_PORTLET_ID, strPortletDocumentId, ft ) );
256 
257         // Add the last modified date of the file a field named "modified".
258         // Use a field that is indexed (i.e. searchable), but don't tokenize
259         // the field into words.
260         String strDate = DateTools.dateToString( document.getDateModification(  ), DateTools.Resolution.DAY );
261         doc.add( new Field( SearchItem.FIELD_DATE, strDate, ft ) );
262 
263         // Add the uid as a field, so that index can be incrementally maintained.
264         // This field is not stored with document, it is indexed, but it is not
265         // tokenized prior to indexing.
266         String strIdDocument = String.valueOf( document.getId(  ) );
267         doc.add( new Field( SearchItem.FIELD_UID, strIdDocument + "_" + DocumentIndexer.SHORT_NAME, ft ) );
268 
269         String strContentToIndex = getContentToIndex( document );
270         int nWriteLimit = AppPropertiesService.getPropertyInt( PROPERTY_WRITER_MAX_FIELD_LENGTH, DEFAULT_WRITER_MAX_FIELD_LENGTH );
271         ContentHandler handler = new BodyContentHandler( nWriteLimit );
272         Metadata metadata = new Metadata(  );
273 
274         try
275         {
276             new HtmlParser(  ).parse( new ByteArrayInputStream( strContentToIndex.getBytes(  ) ), handler, metadata,
277                 new ParseContext(  ) );
278         }
279         catch ( SAXException e )
280         {
281             throw new AppException( "Error during document parsing.", e );
282         }
283         catch ( TikaException e )
284         {
285             throw new AppException( "Error during document parsing.", e );
286         }
287 
288         //the content of the article is recovered in the parser because this one
289         //had replaced the encoded caracters (as &eacute;) by the corresponding special caracter (as ?)
290         String strContent = handler.toString(  );
291 
292         // Add the tag-stripped contents as a Reader-valued Text field so it will
293         // get tokenized and indexed.
294         doc.add( new Field( SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED ) );
295 
296         // Add the title as a separate Text field, so that it can be searched
297         // separately.
298         FieldType ft2 = new FieldType( TextField.TYPE_STORED );
299         ft2.setOmitNorms( true );
300         doc.add( new Field( SearchItem.FIELD_TITLE, document.getTitle(  ), ft2 ) );
301 
302         doc.add( new Field( SearchItem.FIELD_TYPE, document.getType(  ), ft ) );
303 
304         doc.add( new Field( SearchItem.FIELD_ROLE, strRole, ft ) );
305 
306         // add metadata (mapped to summary)
307         doc.add( new Field( SearchItem.FIELD_METADATA, document.getSummary(  ), TextField.TYPE_NOT_STORED ) );
308         doc.add( new StoredField( SearchItem.FIELD_SUMMARY, document.getSummary(  ) ) );
309 
310         // return the document
311         return doc;
312     }
313 
314     /**
315      * Get the content from the document
316      * @param document the document to index
317      * @return the content
318      */
319     private static String getContentToIndex( Document document )
320     {
321         StringBuilder sbContentToIndex = new StringBuilder(  );
322         sbContentToIndex.append( document.getTitle(  ) );
323 
324         for ( DocumentAttribute attribute : document.getAttributes(  ) )
325         {
326             if ( attribute.isSearchable(  ) )
327             {
328                 if ( !attribute.isBinary(  ) )
329                 {
330                     // Text attributes
331                     sbContentToIndex.append( " " );
332                     sbContentToIndex.append( attribute.getTextValue(  ) );
333                 }
334                 else
335                 {
336                     // Binary file attribute
337                     // Gets indexer depending on the ContentType (ie: "application/pdf" should use a PDF indexer)
338                     IFileIndexerFactory factoryIndexer = (IFileIndexerFactory) SpringContextService.getBean( IFileIndexerFactory.BEAN_FILE_INDEXER_FACTORY );
339                     IFileIndexer indexer = factoryIndexer.getIndexer( attribute.getValueContentType(  ) );
340 
341                     if ( indexer != null )
342                     {
343                         try
344                         {
345                             ByteArrayInputStream bais = new ByteArrayInputStream( attribute.getBinaryValue(  ) );
346                             sbContentToIndex.append( " " );
347                             sbContentToIndex.append( indexer.getContentToIndex( bais ) );
348                             bais.close(  );
349                         }
350                         catch ( IOException e )
351                         {
352                             AppLogService.error( e.getMessage(  ), e );
353                         }
354                     }
355                 }
356             }
357         }
358 
359         // Index Metadata
360         sbContentToIndex.append( " " );
361         sbContentToIndex.append( StringUtils.defaultString( document.getXmlMetadata(  ) ) );
362 
363         return sbContentToIndex.toString(  );
364     }
365 
366     /**
367      * {@inheritDoc}
368      */
369     @Override
370     public List<String> getListType(  )
371     {
372         List<String> typeList = new ArrayList<String>(  );
373 
374         for ( ReferenceItem item : DocumentTypeHome.getDocumentTypesList(  ) )
375         {
376             typeList.add( item.getName(  ) );
377         }
378 
379         return typeList;
380     }
381 
382     /**
383      * {@inheritDoc}
384      */
385     @Override
386     public String getSpecificSearchAppUrl(  )
387     {
388         return JSP_PAGE_ADVANCED_SEARCH;
389     }
390 }