View Javadoc
1   /*
2    * Copyright (c) 2002-2017, Mairie de Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.plugins.directory.modules.solr.search;
35  
36  import java.io.ByteArrayInputStream;
37  import java.io.IOException;
38  import java.io.InputStream;
39  import java.io.Reader;
40  import java.io.StringReader;
41  import java.nio.charset.StandardCharsets;
42  import java.util.ArrayList;
43  import java.util.Collections;
44  import java.util.List;
45  
46  import org.apache.commons.lang.StringUtils;
47  import org.apache.tika.exception.TikaException;
48  import org.apache.tika.metadata.Metadata;
49  import org.apache.tika.parser.ParseContext;
50  import org.apache.tika.parser.html.HtmlParser;
51  import org.apache.tika.sax.BodyContentHandler;
52  import org.xml.sax.ContentHandler;
53  import org.xml.sax.SAXException;
54  
55  import fr.paris.lutece.plugins.directory.business.Directory;
56  import fr.paris.lutece.plugins.directory.business.DirectoryFilter;
57  import fr.paris.lutece.plugins.directory.business.DirectoryHome;
58  import fr.paris.lutece.plugins.directory.business.EntryFilter;
59  import fr.paris.lutece.plugins.directory.business.EntryHome;
60  import fr.paris.lutece.plugins.directory.business.IEntry;
61  import fr.paris.lutece.plugins.directory.business.Record;
62  import fr.paris.lutece.plugins.directory.business.RecordField;
63  import fr.paris.lutece.plugins.directory.business.RecordFieldFilter;
64  import fr.paris.lutece.plugins.directory.business.RecordFieldHome;
65  import fr.paris.lutece.plugins.directory.business.RecordHome;
66  import fr.paris.lutece.plugins.directory.service.DirectoryPlugin;
67  import fr.paris.lutece.plugins.directory.utils.DirectoryIndexerUtils;
68  import fr.paris.lutece.plugins.directory.utils.DirectoryUtils;
69  import fr.paris.lutece.plugins.search.solr.business.field.Field;
70  import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexer;
71  import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexerService;
72  import fr.paris.lutece.plugins.search.solr.indexer.SolrItem;
73  import fr.paris.lutece.plugins.search.solr.util.SolrConstants;
74  import fr.paris.lutece.portal.service.content.XPageAppService;
75  import fr.paris.lutece.portal.service.plugin.Plugin;
76  import fr.paris.lutece.portal.service.plugin.PluginService;
77  import fr.paris.lutece.portal.service.util.AppLogService;
78  import fr.paris.lutece.portal.service.util.AppPropertiesService;
79  import fr.paris.lutece.util.url.UrlItem;
80  
81  
82  /**
83   * The Directory indexer for Solr search platform
84   *
85   */
86  public class SolrDirectoryIndexer implements SolrIndexer
87  {
88      private static final String PROPERTY_DESCRIPTION = "directory-solr.indexer.description";
89      private static final String PROPERTY_NAME = "directory-solr.indexer.name";
90      private static final String PROPERTY_VERSION = "directory-solr.indexer.version";
91      private static final String PROPERTY_INDEXER_ENABLE = "directory-solr.indexer.enable";
92  
93      public static final String SHORT_NAME = "dry";
94      private static final String DIRECTORY = "directory";
95      private static final String PARAMETER_ID_DIRECTORY_RECORD = "id_directory_record";
96      private static final String PARAMETER_VIEW_DIRECTORY_RECORD = "view_directory_record";
97      private static final String ROLE_NONE = "none";
98      private static final List<String> LIST_RESSOURCES_NAME = new ArrayList<String>(  );
99  
100     private static final String DIRECTORY_INDEXATION_ERROR = "[SolrDirectoryIndexer] An error occured during the indexation of the record number ";
101     
102     public SolrDirectoryIndexer(  )
103     {
104         super(  );
105 
106         LIST_RESSOURCES_NAME.add( DirectoryIndexerUtils.CONSTANT_TYPE_RESOURCE );
107     }
108 
109     /**
110      * {@inheritDoc}
111      */
112     public String getDescription(  )
113     {
114         return AppPropertiesService.getProperty( PROPERTY_DESCRIPTION );
115     }
116 
117     /**
118      * {@inheritDoc}
119      */
120     public String getName(  )
121     {
122         return AppPropertiesService.getProperty( PROPERTY_NAME );
123     }
124 
125     /**
126      * {@inheritDoc}
127      */
128     public String getVersion(  )
129     {
130         return AppPropertiesService.getProperty( PROPERTY_VERSION );
131     }
132 
133     /**
134      * {@inheritDoc}
135      */
136     public List<String> indexDocuments(  )
137     {
138         Plugin plugin = PluginService.getPlugin( DirectoryPlugin.PLUGIN_NAME );
139         List<String> lstErrors = new ArrayList<String>(  );
140         
141         // Index only the directories that have the attribute is_indexed as true
142         DirectoryFilter dirFilter = new DirectoryFilter(  );
143         dirFilter.setIsIndexed( DirectoryFilter.FILTER_TRUE );
144         dirFilter.setIsDisabled( DirectoryFilter.FILTER_TRUE ); //Bad naming: IsDisable( true ) stands for enabled
145 
146         for ( Directory directory : DirectoryHome.getDirectoryList( dirFilter, plugin ) )
147         {
148         	try
149         	{
150         		int nIdDirectory = directory.getIdDirectory(  );
151 
152         		//Index only the records that have the attribute is_enable as true
153         		RecordFieldFilter recFilter = new RecordFieldFilter(  );
154         		recFilter.setIdDirectory( nIdDirectory );
155         		recFilter.setIsDisabled( RecordFieldFilter.FILTER_TRUE ); //Bad naming: IsDisable( true ) stands for enabled
156 
157         		List<Record> listRecord = RecordHome.getListRecord( recFilter, plugin );
158 
159         		//Keep processing this directory only if there are enabled records
160         		if ( !listRecord.isEmpty(  ) )
161         		{
162         			//Parse the entries to gather the ones marked as indexed
163         			EntryFilter entryFilter = new EntryFilter(  );
164         			entryFilter.setIdDirectory( nIdDirectory );
165         			entryFilter.setIsIndexed( EntryFilter.FILTER_TRUE );
166 
167         			List<IEntry> listIndexedEntry = EntryHome.getEntryList( entryFilter, plugin );
168 
169         			entryFilter.setIsIndexed( EntryFilter.ALL_INT );
170         			entryFilter.setIsIndexedAsTitle( EntryFilter.FILTER_TRUE );
171 
172         			List<IEntry> listIndexedAsTitleEntry = EntryHome.getEntryList( entryFilter, plugin );
173 
174         			entryFilter.setIsIndexedAsTitle( EntryFilter.ALL_INT );
175         			entryFilter.setIsIndexedAsSummary( EntryFilter.FILTER_TRUE );
176 
177         			List<IEntry> listIndexedAsSummaryEntry = EntryHome.getEntryList( entryFilter, plugin );
178 
179         			for ( Record record : listRecord )
180         			{
181         				SolrItem recordDoc = getDocument( record, listIndexedEntry, listIndexedAsTitleEntry,
182         						listIndexedAsSummaryEntry, plugin );
183 
184         				if ( recordDoc != null )
185         				{
186         					SolrIndexerService.write( recordDoc );
187         				}
188         			}
189         		}
190         	}
191         	catch ( Exception e )
192         	{
193         		lstErrors.add( SolrIndexerService.buildErrorMessage( e ) );
194 				AppLogService.error( DIRECTORY_INDEXATION_ERROR + directory.getIdDirectory(  ), e );
195 			}
196         }
197         
198         return lstErrors;
199     }
200 
201     /**
202      * {@inheritDoc}
203      */
204     public boolean isEnable(  )
205     {
206         return "true".equalsIgnoreCase( AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE ) );
207     }
208 
209     /**
210      * {@inheritDoc}
211      */
212     public List<Field> getAdditionalFields(  )
213     {
214         return new ArrayList<Field>(  );
215     }
216 
217     /**
218      * {@inheritDoc}
219      */
220     public List<SolrItem> getDocuments( String recordId )
221     {
222         Plugin plugin = PluginService.getPlugin( DirectoryPlugin.PLUGIN_NAME );
223 
224         int nIdRecord;
225 
226         try
227         {
228             nIdRecord = Integer.parseInt( recordId );
229         }
230         catch ( NumberFormatException ne )
231         {
232             AppLogService.error( recordId + " not parseable to an int", ne );
233 
234             return new ArrayList<SolrItem>( 0 );
235         }
236 
237         Record record = RecordHome.findByPrimaryKey( nIdRecord, plugin );
238         Directory directory = record.getDirectory(  );
239 
240         if ( !record.isEnabled(  ) || !directory.isEnabled(  ) || !directory.isIndexed(  ) )
241         {
242             return new ArrayList<SolrItem>( 0 );
243         }
244 
245         int nIdDirectory = directory.getIdDirectory(  );
246 
247         //Parse the entries to gather the ones marked as indexed
248         EntryFilter entryFilter = new EntryFilter(  );
249         entryFilter.setIdDirectory( nIdDirectory );
250         entryFilter.setIsIndexed( EntryFilter.FILTER_TRUE );
251 
252         List<IEntry> listIndexedEntry = EntryHome.getEntryList( entryFilter, plugin );
253 
254         entryFilter.setIsIndexed( EntryFilter.ALL_INT );
255         entryFilter.setIsIndexedAsTitle( EntryFilter.FILTER_TRUE );
256 
257         List<IEntry> listIndexedAsTitleEntry = EntryHome.getEntryList( entryFilter, plugin );
258 
259         entryFilter.setIsIndexedAsTitle( EntryFilter.ALL_INT );
260         entryFilter.setIsIndexedAsSummary( EntryFilter.FILTER_TRUE );
261 
262         List<IEntry> listIndexedAsSummaryEntry = EntryHome.getEntryList( entryFilter, plugin );
263 
264         List<SolrItem> listDocument = Collections.EMPTY_LIST;
265 
266         try
267         {
268             SolrItem doc = getDocument( record, listIndexedEntry, listIndexedAsTitleEntry, listIndexedAsSummaryEntry,
269                     plugin );
270 
271             if ( doc != null )
272             {
273                 listDocument = new ArrayList<SolrItem>( 1 );
274                 listDocument.add( doc );
275             }
276         }
277         catch ( IOException e )
278         {
279             throw new RuntimeException( e );
280         }
281 
282         return listDocument;
283     }
284 
285     /**
286      * Builds a {@link SolrItem} which will be used by Solr during the indexing of this record
287      * @param record the record to convert into a document
288      * @param listContentEntry the entries in this record that are marked as is_indexed
289      * @param listTitleEntry the entries in this record that are marked as is_indexed_as_title
290      * @param listSummaryEntry the entries in this record that are marked as is_indexed_as_summary
291      * @param plugin the plugin object
292      * @return a Solr item filled with the record data
293      * @throws IOException
294      */
295     private SolrItem getDocument( Record record, List<IEntry> listContentEntry, List<IEntry> listTitleEntry,
296         List<IEntry> listSummaryEntry, Plugin plugin )
297         throws IOException
298     {
299         SolrItem item = new SolrItem(  );
300 
301         boolean bFallback = false;
302 
303         //Fallback if there is no entry marker as indexed_as_title
304         //Uses the first indexed field instead
305         if ( listTitleEntry.isEmpty(  ) && !listContentEntry.isEmpty(  ) )
306         {
307             listTitleEntry.add( listContentEntry.get( 0 ) );
308             bFallback = true;
309         }
310 
311         String strTitle = getContentToIndex( record, listTitleEntry, plugin );
312 
313         //Fallback if fields were empty
314         //Uses the first indexed field instead
315         if ( StringUtils.isBlank( strTitle ) && !bFallback && !listContentEntry.isEmpty(  ) )
316         {
317             listTitleEntry.clear(  );
318             listTitleEntry.add( listContentEntry.get( 0 ) );
319             strTitle = getContentToIndex( record, listTitleEntry, plugin );
320         }
321 
322         //No more fallback. Giving up
323         if ( StringUtils.isBlank( strTitle ) )
324         {
325             return null;
326         }
327 
328         // Setting the Title field
329         item.setTitle( strTitle );
330 
331         if ( !listContentEntry.isEmpty(  ) )
332         {
333             String strContent = getContentToIndex( record, listContentEntry, plugin );
334 
335             if ( StringUtils.isNotBlank( strContent ) )
336             {
337                 HtmlParser parser = new HtmlParser(  );
338                 ContentHandler handler = new BodyContentHandler();
339                 Metadata metadata = new Metadata();
340                 InputStream stream = new ByteArrayInputStream(strContent.getBytes(StandardCharsets.UTF_8));
341                 try {
342         			parser.parse(stream,  handler, metadata, new ParseContext());
343         		} catch (SAXException e) {
344         			e.printStackTrace();
345         		} catch (TikaException e) {
346         			e.printStackTrace();
347         		}
348                 item.setContent( handler.toString(  ) );
349             }
350         }
351 
352         if ( !listSummaryEntry.isEmpty(  ) )
353         {
354             String strSummary = getContentToIndex( record, listSummaryEntry, plugin );
355 
356             if ( StringUtils.isNotBlank( strSummary ) )
357             {
358                 // Setting the Summary field
359                 item.setSummary( strSummary );
360             }
361         }
362 
363         String strRoleKey = record.getRoleKey(  );
364 
365         if ( StringUtils.isBlank( strRoleKey ) )
366         {
367             strRoleKey = ROLE_NONE;
368         }
369 
370         // Setting the role field
371         item.setRole( strRoleKey );
372 
373         // Setting the date field
374         item.setDate( record.getDateCreation(  ) );
375 
376         UrlItem url = new UrlItem( SolrIndexerService.getBaseUrl(  ) );
377         url.addParameter( XPageAppService.PARAM_XPAGE_APP, DIRECTORY );
378         url.addParameter( PARAMETER_ID_DIRECTORY_RECORD, record.getIdRecord(  ) );
379         url.addParameter( PARAMETER_VIEW_DIRECTORY_RECORD, "" );
380         // Setting the Url field
381         item.setUrl( url.getUrl(  ) );
382 
383         //Add the uid as a field, so that index can be incrementally maintained.
384         // This field is not stored with question/answer, it is indexed, but it is not
385         // tokenized prior to indexing.
386         // Setting the Uid field
387         item.setUid( getResourceUid( Integer.toString( record.getIdRecord(  ) ),
388                 DirectoryIndexerUtils.CONSTANT_TYPE_RESOURCE ) );
389 
390         // Setting the Type field
391         item.setType( DIRECTORY );
392 
393         // Setting the Site field
394         item.setSite( SolrIndexerService.getWebAppName(  ) );
395 
396         return item;
397     }
398 
399     /**
400      * Concatenates the value of the specified field in this record
401      * @param record the record to seek
402      * @param listEntry the list of field to concatenate
403      * @param plugin the plugin object
404      * @return
405      */
406     private String getContentToIndex( Record record, List<IEntry> listEntry, Plugin plugin )
407     {
408         List<Integer> listIdEntry = new ArrayList<Integer>( listEntry.size(  ) );
409 
410         for ( IEntry entry : listEntry )
411         {
412             listIdEntry.add( entry.getIdEntry(  ) );
413         }
414 
415         StringBuffer sb = new StringBuffer(  );
416 
417         List<RecordField> listField = RecordFieldHome.getRecordFieldSpecificList( listIdEntry, record.getIdRecord(  ),
418                 plugin, DirectoryUtils.getMapFieldsOfListEntry( listEntry, plugin ) );
419 
420         for ( RecordField field : listField )
421         {
422             sb.append( RecordFieldHome.findByPrimaryKey( field.getIdRecordField(  ), plugin ).getValue(  ) );
423             sb.append( " " );
424         }
425 
426         return sb.toString(  );
427     }
428 
429     /**
430      * {@inheritDoc}
431      */
432     public List<String> getResourcesName(  )
433     {
434         return LIST_RESSOURCES_NAME;
435     }
436 
437     /**
438      * {@inheritDoc}
439      */
440     public String getResourceUid( String strResourceId, String strResourceType )
441     {
442         StringBuffer sb = new StringBuffer( strResourceId );
443         sb.append( SolrConstants.CONSTANT_UNDERSCORE ).append( SHORT_NAME );
444 
445         return sb.toString(  );
446     }
447 }