View Javadoc
1   /*
2    * Copyright (c) 2002-2020, City of Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.plugins.directories.modules.solr.indexer;
35  
import fr.paris.lutece.plugins.directories.business.DirectoryEntity;
import fr.paris.lutece.plugins.directories.service.DirectoriesService;
import fr.paris.lutece.plugins.directories.util.DirectoriesUtils;
import fr.paris.lutece.plugins.genericattributes.business.Entry;
import fr.paris.lutece.plugins.genericattributes.business.FieldHome;
import fr.paris.lutece.plugins.genericattributes.business.Response;
import fr.paris.lutece.plugins.search.solr.business.field.Field;
import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexer;
import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexerService;
import fr.paris.lutece.plugins.search.solr.indexer.SolrItem;
import fr.paris.lutece.plugins.search.solr.util.SolrConstants;
import fr.paris.lutece.portal.service.util.AppException;
import fr.paris.lutece.portal.service.util.AppLogService;
import fr.paris.lutece.portal.service.util.AppPropertiesService;
import fr.paris.lutece.util.url.UrlItem;
import javassist.expr.NewArray;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
71  
72  /**
73   * The indexer service for Solr.
74   *
75   */
76  public class SolrDocIndexer implements SolrIndexer
77  {
78      public static final String BEAN_NAME = "directories-solr.solrDocIndexer";
79      private static final String TYPE = "directories";
80      private static final String PARAMETER_ENTITY_ID = "entity_id";
81      private static final String PROPERTY_INDEXER_ENABLE = "solr.indexer.document.enable";
82      private static final String PROPERTY_DOCUMENT_MAX_CHARS = "directories-solr.indexer.document.characters.limit";
83      private static final String PROPERTY_NAME = "directories-solr.indexer.name";
84      private static final String PROPERTY_DESCRIPTION = "directories-solr.indexer.description";
85      private static final String PROPERTY_VERSION = "directories-solr.indexer.version";
86      private static final String PARAMETER_XPAGE = "page";
87      private static final String XPAGE_DIRECTORIES = "directories";
88      private static final String PARAMETER_VIEW = "view";
89      private static final String PARAMETER_VIEW_ENTITY = "viewDirectoryEntity";
90      private static final List<String> LIST_RESSOURCES_NAME = new ArrayList<String>( );
91      private static final String SHORT_NAME = "entity";
92      private static final String DOC_INDEXATION_ERROR = "[SolrDirectoriesIndexer] An error occured during the indexation of the document number ";
93      private static final Integer PARAMETER_DOCUMENT_MAX_CHARS = Integer.parseInt( AppPropertiesService.getProperty( PROPERTY_DOCUMENT_MAX_CHARS ) );
94  
95      /**
96       * Creates a new SolrPageIndexer
97       */
98      public SolrDocIndexer( )
99      {
100         LIST_RESSOURCES_NAME.add( DirectoriesUtils.CONSTANT_TYPE_RESOURCE );
101     }
102 
103     @Override
104     public boolean isEnable( )
105     {
106         return "true".equalsIgnoreCase( AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE ) );
107     }
108 
109     /**
110      * {@inheritDoc}
111      */
112     @Override
113     public List<String> indexDocuments( )
114     {
115         List<String> lstErrors = new ArrayList<String>( );
116         List<Integer> listDocument = new ArrayList<Integer>( );
117         Collection<SolrItem> solrItems = new ArrayList<SolrItem>( );
118         for ( DirectoryEntity document : DirectoriesService.getInstance( ).getListDocWithoutBinaries( ) )
119         {
120             try
121             {
122                 if ( !listDocument.contains( document.getId( ) ) )
123                 {
124                     // Generates the item to index
125                     SolrItem item = getItem( document );
126                     if ( item != null )
127                     {
128                         solrItems.add( item );
129                     }
130                     listDocument.add( document.getId( ) );
131                 }
132             }
133             catch( Exception e )
134             {
135                 lstErrors.add( SolrIndexerService.buildErrorMessage( e ) );
136                 AppLogService.error( DOC_INDEXATION_ERROR + document.getId( ), e );
137             }
138         }
139         try
140         {
141             SolrIndexerService.write( solrItems );
142         }
143         catch( Exception e )
144         {
145             lstErrors.add( SolrIndexerService.buildErrorMessage( e ) );
146             AppLogService.error( DOC_INDEXATION_ERROR, e );
147         }
148         return lstErrors;
149     }
150 
151     /**
152      * Get item
153      * 
154      * @param portlet
155      *            The portlet
156      * @param document
157      *            The document
158      * @return The item
159      * @throws IOException
160      */
161     private SolrItem getItem( DirectoryEntity document ) throws IOException
162     {
163         // the item
164         SolrItem item = new SolrItem( );
165         item.setUid( getResourceUid( Integer.valueOf( document.getId( ) ).toString( ), DirectoriesUtils.CONSTANT_TYPE_RESOURCE ) );
166         item.setDate( document.getCreation( ) );
167         item.setType( TYPE );
168         item.setSite( SolrIndexerService.getWebAppName( ) );
169         item.setRole( "none" );
170         item.setTitle( document.getTitle( ) );
171         // Reload the full object to get all its searchable attributes
172         UrlItem url = new UrlItem( SolrIndexerService.getBaseUrl( ) );
173         url.addParameter( PARAMETER_XPAGE, XPAGE_DIRECTORIES );
174         url.addParameter( PARAMETER_VIEW, PARAMETER_VIEW_ENTITY );
175         url.addParameter( PARAMETER_ENTITY_ID, document.getId( ) );
176         item.setUrl( url.getUrl( ) );
177         // Date Hierarchy
178         GregorianCalendar calendar = new GregorianCalendar( );
179         calendar.setTime( document.getCreation( ) );
180         item.setHieDate( calendar.get( GregorianCalendar.YEAR ) + "/" + ( calendar.get( GregorianCalendar.MONTH ) + 1 ) + "/"
181                 + calendar.get( GregorianCalendar.DAY_OF_MONTH ) );
182         // The content
183         String strContentToIndex = getContentToIndex( document, item );
184         ContentHandler handler = null;
185         if ( PARAMETER_DOCUMENT_MAX_CHARS != null )
186         {
187             handler = new BodyContentHandler( PARAMETER_DOCUMENT_MAX_CHARS );
188         }
189         else
190         {
191             handler = new BodyContentHandler( );
192         }
193         Metadata metadata = new Metadata( );
194         try
195         {
196             new HtmlParser( ).parse( new ByteArrayInputStream( strContentToIndex.getBytes( ) ), handler, metadata, new ParseContext( ) );
197         }
198         catch( SAXException e )
199         {
200             throw new AppException( "Error during document parsing." );
201         }
202         catch( TikaException e )
203         {
204             throw new AppException( "Error during document parsing." );
205         }
206         item.setContent( handler.toString( ) );
207         return item;
208     }
209 
210     /**
211      * GEt the content to index
212      * 
213      * @param document
214      *            The document
215      * @param item
216      *            The SolR item
217      * @return The content
218      */
219     private static String getContentToIndex( DirectoryEntity document, SolrItem item )
220     {
221         StringBuilder sbContentToIndex = new StringBuilder( );
222         List<Response> listResponse = document.getResponses( );
223 
224         Map<Entry, List<Response>> entryMap = listResponse.stream( ).collect( Collectors.groupingBy( Response::getEntry ) );
225 
226         entryMap.forEach( ( entry, listResponseFiltered ) -> {
227 
228             String strFieldName = "attribute" + listResponseFiltered.get( 0 ).getEntry( ).getIdEntry( );
229             List<String> valueList = new ArrayList<>( );
230 
231             if ( listResponseFiltered.get( 0 ).getField( ) != null )
232             {
233                 for ( Response response : listResponseFiltered )
234                 {
235                     int nIdField = response.getField( ).getIdField( );
236                     fr.paris.lutece.plugins.genericattributes.business.Field field = FieldHome.findByPrimaryKey( nIdField );
237                     String value = field.getTitle( );
238                     if ( value == null )
239                     {
240                         value = response.getResponseValue( );
241                     }
242                     valueList.add( value );
243                     sbContentToIndex.append( " " );
244                     sbContentToIndex.append( value );
245                 }
246                 item.addDynamicField( strFieldName, valueList );
247             }
248             else
249             {
250                 String value = listResponseFiltered.get( 0 ).getResponseValue( );
251                 if ( value != null )
252                 {
253                     item.addDynamicField( strFieldName, value );
254                     item.addDynamicFieldNotAnalysed( strFieldName, value );
255                     sbContentToIndex.append( " " );
256                     sbContentToIndex.append( value );
257                 }
258             }
259         } );
260 
261         String strContentDistinct = Arrays.stream( sbContentToIndex.toString( ).split( "\\s+" ) ).distinct( ).collect( Collectors.joining( " " ) );
262         String newStrContentDistinct = strContentDistinct.replaceAll( "null", "" );
263         StringBuilder sb = new StringBuilder( newStrContentDistinct );
264 
265         return sb.toString( );
266     }
267 
268     // GETTERS & SETTERS
269     /**
270      * Returns the name of the indexer.
271      *
272      * @return the name of the indexer
273      */
274     @Override
275     public String getName( )
276     {
277         return AppPropertiesService.getProperty( PROPERTY_NAME );
278     }
279 
280     /**
281      * Returns the version.
282      *
283      * @return the version.
284      */
285     @Override
286     public String getVersion( )
287     {
288         return AppPropertiesService.getProperty( PROPERTY_VERSION );
289     }
290 
291     /**
292      * {@inheritDoc}
293      */
294     @Override
295     public String getDescription( )
296     {
297         return AppPropertiesService.getProperty( PROPERTY_DESCRIPTION );
298     }
299 
300     /**
301      * {@inheritDoc}
302      */
303     @Override
304     public List<Field> getAdditionalFields( )
305     {
306         List<Field> lstFields = new ArrayList<Field>( );
307         return lstFields;
308     }
309 
310     /**
311      * {@inheritDoc}
312      */
313     @Override
314     public List<SolrItem> getDocuments( String strIdDocument )
315     {
316         List<SolrItem> lstItems = new ArrayList<SolrItem>( );
317         return lstItems;
318     }
319 
320     /**
321      * {@inheritDoc}
322      */
323     @Override
324     public List<String> getResourcesName( )
325     {
326         return LIST_RESSOURCES_NAME;
327     }
328 
329     /**
330      * {@inheritDoc}
331      */
332     @Override
333     public String getResourceUid( String strResourceId, String strResourceType )
334     {
335         StringBuilder sb = new StringBuilder( strResourceId );
336         sb.append( SolrConstants.CONSTANT_UNDERSCORE ).append( SHORT_NAME );
337         return sb.toString( );
338     }
339 
340 }