1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.plugins.document.modules.solr.indexer;
35
36 import fr.paris.lutece.plugins.document.business.Document;
37 import fr.paris.lutece.plugins.document.business.DocumentHome;
38 import fr.paris.lutece.plugins.document.business.DocumentType;
39 import fr.paris.lutece.plugins.document.business.DocumentTypeHome;
40 import fr.paris.lutece.plugins.document.business.attributes.DocumentAttribute;
41 import fr.paris.lutece.plugins.document.business.attributes.DocumentAttributeHome;
42 import fr.paris.lutece.plugins.document.business.category.Category;
43 import fr.paris.lutece.plugins.document.business.portlet.DocumentListPortletHome;
44 import fr.paris.lutece.plugins.document.business.portlet.DocumentPortletHome;
45 import fr.paris.lutece.plugins.document.service.publishing.PublishingService;
46 import fr.paris.lutece.plugins.document.utils.DocumentIndexerUtils;
47 import fr.paris.lutece.plugins.leaflet.business.GeolocItem;
48 import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexer;
49 import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexerFactory;
50 import fr.paris.lutece.plugins.search.solr.business.field.Field;
51 import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexer;
52 import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexerService;
53 import fr.paris.lutece.plugins.search.solr.indexer.SolrItem;
54 import fr.paris.lutece.plugins.search.solr.util.SolrConstants;
55 import fr.paris.lutece.portal.business.page.Page;
56 import fr.paris.lutece.portal.business.page.PageHome;
57 import fr.paris.lutece.portal.business.portlet.Portlet;
58 import fr.paris.lutece.portal.business.portlet.PortletHome;
59 import fr.paris.lutece.portal.service.spring.SpringContextService;
60 import fr.paris.lutece.portal.service.util.AppException;
61 import fr.paris.lutece.portal.service.util.AppLogService;
62 import fr.paris.lutece.portal.service.util.AppPropertiesService;
63 import fr.paris.lutece.util.url.UrlItem;
64
65 import org.apache.commons.lang.StringUtils;
66 import org.apache.tika.exception.TikaException;
67 import org.apache.tika.metadata.Metadata;
68 import org.apache.tika.parser.ParseContext;
69 import org.apache.tika.parser.html.HtmlParser;
70 import org.apache.tika.sax.BodyContentHandler;
71 import org.xml.sax.ContentHandler;
72 import org.xml.sax.SAXException;
73
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
85
86
87
88
89
90 public class SolrDocIndexer implements SolrIndexer
91 {
92 public static final String BEAN_NAME = "document-solr.solrDocIndexer";
93
94
95 private static final String PARAMETER_PORTLET_ID = "portlet_id";
96 private static final String PROPERTY_INDEXER_ENABLE = "solr.indexer.document.enable";
97 private static final String PROPERTY_DOCUMENT_MAX_CHARS = "document-solr.indexer.document.characters.limit";
98 private static final String PROPERTY_NAME = "document-solr.indexer.name";
99 private static final String PROPERTY_DESCRIPTION = "document-solr.indexer.description";
100 private static final String PROPERTY_VERSION = "document-solr.indexer.version";
101 private static final String PROPERTY_DOCUMENT_PORTLET_ENABLE = "document-solr.indexer.documentPortlet.enable";
102 private static final String PARAMETER_DOCUMENT_ID = "document_id";
103 private static final String PARAMETER_ATTRIBUTE_ID = "id_attribute";
104 private static final List<String> LIST_RESSOURCES_NAME = new ArrayList<String>( );
105 private static final String SHORT_NAME = "doc";
106 private static final String DOC_INDEXATION_ERROR = "[SolrDocIndexer] An error occured during the indexation of the document number ";
107
108 private static final String PARAMETER_TYPE_NUMERICTEXT = "numerictext";
109 private static final String PARAMETER_TYPE_GEOLOC = "geoloc";
110 private static final String PARAMETER_TYPE_DATE = "date";
111
112 private static final String PROPERTY_WRITER_MAX_FIELD_LENGTH = "search.lucene.writer.maxFieldLength";
113 private static final int DEFAULT_WRITER_MAX_FIELD_LENGTH = 1000000;
114
115
116
117
118 public SolrDocIndexer( )
119 {
120 LIST_RESSOURCES_NAME.add( DocumentIndexerUtils.CONSTANT_TYPE_RESOURCE );
121 }
122
123 @Override
124 public boolean isEnable( )
125 {
126 return "true".equalsIgnoreCase( AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE ) );
127 }
128
129
130
131
132
133
134 public boolean isDocumentPortletEnable( )
135 {
136 return Boolean.TRUE.equals( AppPropertiesService.getPropertyBoolean( PROPERTY_DOCUMENT_PORTLET_ENABLE, Boolean.FALSE ) );
137 }
138
139
140
141
142 @Override
143 public List<String> indexDocuments( )
144 {
145 List<String> lstErrors = new ArrayList<String>( );
146 List<Integer> listDocument = new ArrayList<Integer>( );
147
148
149 List<Portlet> portletList = PortletHome.findByType( DocumentListPortletHome.getInstance( ).getPortletTypeId( ) );
150
151
152 if ( isDocumentPortletEnable( ) )
153 {
154 portletList.addAll( PortletHome.findByType( DocumentPortletHome.getInstance( ).getPortletTypeId( ) ) );
155 }
156
157 for ( Portlet portlet : portletList )
158 {
159 Collection<SolrItem> solrItems = new ArrayList<SolrItem>( );
160
161 for ( Document d : PublishingService.getInstance( ).getPublishedDocumentsByPortletId( portlet.getId( ) ) )
162 {
163 try
164 {
165
166 Document document = DocumentHome.findByPrimaryKey( d.getId( ) );
167
168 if ( document != null && !listDocument.contains( document.getId( ) ) )
169 {
170
171 SolrItem item = getItem( portlet, document );
172
173 if ( item != null )
174 {
175 solrItems.add( getItem( portlet, document ) );
176 }
177 listDocument.add( document.getId( ) );
178 }
179 }
180 catch( Exception e )
181 {
182 lstErrors.add( DOC_INDEXATION_ERROR + d.getId( ) + " : " + SolrIndexerService.buildErrorMessage( e ) );
183 AppLogService.error( DOC_INDEXATION_ERROR + d.getId( ), e );
184
185 }
186 }
187
188 try
189 {
190 SolrIndexerService.write( solrItems );
191 }
192 catch( Exception e )
193 {
194 lstErrors.add( SolrIndexerService.buildErrorMessage( e ) );
195 AppLogService.error( DOC_INDEXATION_ERROR, e );
196 }
197 }
198
199 return lstErrors;
200 }
201
202
203
204
205
206
207
208
209 public List<String> indexListDocuments( Portlet portlet, List<Integer> listIdDocument ) throws Exception
210 {
211 List<String> lstErrors = new ArrayList<>( );
212 StringBuilder sbLogs = new StringBuilder( );
213
214 Collection<SolrItem> solrItems = new ArrayList<>( );
215 for ( Integer d : listIdDocument )
216 {
217 Document document = DocumentHome.findByPrimaryKey( d );
218 if ( document != null )
219 {
220 SolrItem item = getItem( portlet, document );
221
222 if ( item != null )
223 {
224 solrItems.add( getItem( portlet, document ) );
225 }
226 }
227 }
228
229 try
230 {
231 SolrIndexerService.write( solrItems, sbLogs );
232
233 }
234 catch( Exception e )
235 {
236 lstErrors.add( SolrIndexerService.buildErrorMessage( e ) );
237 lstErrors.add( sbLogs.toString( ) );
238 AppLogService.error( DOC_INDEXATION_ERROR, e );
239
240 }
241
242 return lstErrors;
243 }
244
245
246
247
248
249
250
251
252
253
254
255 private SolrItem getItem( Portlet portlet, Document document ) throws Exception
256 {
257
258 SolrItem item = new SolrItem( );
259 item.setUid( getResourceUid( Integer.valueOf( document.getId( ) ).toString( ), DocumentIndexerUtils.CONSTANT_TYPE_RESOURCE ) );
260 item.setDate( document.getDateModification( ) );
261 item.setType( document.getType( ) );
262 item.setSummary( document.getSummary( ) );
263 item.setTitle( document.getTitle( ) );
264 item.setSite( SolrIndexerService.getWebAppName( ) );
265 item.setRole( "none" );
266
267 if ( portlet != null )
268 {
269 item.setDocPortletId( document.getId( ) + SolrConstants.CONSTANT_AND + portlet.getId( ) );
270 }
271
272 item.setXmlContent( document.getXmlValidatedContent( ) );
273
274
275 UrlItem url = new UrlItem( SolrIndexerService.getBaseUrl( ) );
276 url.addParameter( PARAMETER_DOCUMENT_ID, document.getId( ) );
277 url.addParameter( PARAMETER_PORTLET_ID, portlet.getId( ) );
278 item.setUrl( url.getUrl( ) );
279
280
281 GregorianCalendar calendar = new GregorianCalendar( );
282 calendar.setTime( document.getDateModification( ) );
283 item.setHieDate( calendar.get( GregorianCalendar.YEAR ) + "/" + ( calendar.get( GregorianCalendar.MONTH ) + 1 ) + "/"
284 + calendar.get( GregorianCalendar.DAY_OF_MONTH ) + "/" );
285
286 List<String> categorie = new ArrayList<String>( );
287
288 for ( Category cat : document.getCategories( ) )
289 {
290 categorie.add( cat.getName( ) );
291 }
292
293 item.setCategorie( categorie );
294
295
296 String strContentToIndex = getContentToIndex( document, item );
297 String strMaxChars = AppPropertiesService.getProperty( PROPERTY_DOCUMENT_MAX_CHARS );
298 int nMaxChars;
299 if ( StringUtils.isNotBlank( strMaxChars ) )
300 {
301 nMaxChars = Integer.parseInt( strMaxChars );
302 }
303 else
304 {
305 nMaxChars = AppPropertiesService.getPropertyInt( PROPERTY_WRITER_MAX_FIELD_LENGTH, DEFAULT_WRITER_MAX_FIELD_LENGTH );
306 }
307 ContentHandler handler = new BodyContentHandler( nMaxChars );
308
309 Metadata metadata = new Metadata( );
310
311 new HtmlParser( ).parse( new ByteArrayInputStream( strContentToIndex.getBytes( ) ), handler, metadata, new ParseContext( ) );
312 item.setContent( handler.toString( ) );
313
314 return item;
315 }
316
317
318
319
320
321
322
323
324
325
326 private static String getContentToIndex( Document document, SolrItem item )
327 {
328 StringBuilder sbContentToIndex = new StringBuilder( );
329 sbContentToIndex.append( document.getTitle( ) );
330 sbContentToIndex.append( " " );
331
332 for ( DocumentAttribute attribute : document.getAttributes( ) )
333 {
334 if ( attribute.isSearchable( ) )
335 {
336 if ( !attribute.isBinary( ) )
337 {
338 if ( PARAMETER_TYPE_GEOLOC.equalsIgnoreCase( attribute.getCodeAttributeType( ) ) )
339 {
340
341 String address = null;
342 GeolocItem geolocItem = null;
343 try
344 {
345 geolocItem = GeolocItem.fromJSON( attribute.getTextValue( ) );
346 }
347 catch( IOException e )
348 {
349 AppLogService.error( "SolrDocumentIndexer, error parsing JSON " + e.getMessage( ), e );
350 }
351 if ( geolocItem != null && geolocItem.getAddress( ) != null )
352 {
353 sbContentToIndex.append( geolocItem.getAddress( ) );
354 }
355 }
356 else
357 {
358
359 sbContentToIndex.append( attribute.getTextValue( ) );
360 }
361 sbContentToIndex.append( " " );
362
363
364
365 if ( PARAMETER_TYPE_NUMERICTEXT.equalsIgnoreCase( attribute.getCodeAttributeType( ) ) )
366 {
367 Long nI = StringUtils.isNotEmpty( attribute.getTextValue( ) ) && StringUtils.isNumeric( attribute.getTextValue( ).trim( ) )
368 ? Long.valueOf( attribute.getTextValue( ).trim( ) )
369 : 0;
370 item.addDynamicField( attribute.getCode( ), nI );
371 }
372 else
373 if ( PARAMETER_TYPE_GEOLOC.equalsIgnoreCase( attribute.getCodeAttributeType( ) ) )
374 {
375 item.addDynamicFieldGeoloc( attribute.getCode( ), attribute.getTextValue( ), document.getCodeDocumentType( ) );
376 }
377 else
378 if ( PARAMETER_TYPE_DATE.equalsIgnoreCase( attribute.getCodeAttributeType( ) ) && !"".equals( attribute.getTextValue( ) ) )
379 {
380
381 DateFormat format = new SimpleDateFormat( "dd/MM/yyyy" );
382 try
383 {
384 Date date = format.parse( attribute.getTextValue( ) );
385 item.addDynamicField( attribute.getCode( ), date );
386 }
387 catch( ParseException e )
388 {
389 AppLogService.error( e.getMessage( ), e );
390 }
391 }
392 else
393 item.addDynamicField( attribute.getCode( ), attribute.getTextValue( ) );
394 }
395 else
396 {
397
398
399 IFileIndexerFactory _factoryIndexer = (IFileIndexerFactory) SpringContextService.getBean( IFileIndexerFactory.BEAN_FILE_INDEXER_FACTORY );
400 IFileIndexer indexer = _factoryIndexer.getIndexer( attribute.getValueContentType( ) );
401
402 if ( indexer != null )
403 {
404 try
405 {
406 ByteArrayInputStream bais = new ByteArrayInputStream( attribute.getBinaryValue( ) );
407 sbContentToIndex.append( indexer.getContentToIndex( bais ) );
408 sbContentToIndex.append( " " );
409 bais.close( );
410 }
411 catch( IOException e )
412 {
413 AppLogService.error( e.getMessage( ), e );
414 }
415 }
416 else
417 {
418 AppLogService.debug( "No indexer found. Url to this data will be given instead" );
419
420 String strName = attribute.getCode( ) + "_" + attribute.getCodeAttributeType( ) + "_url";
421 UrlItem url = new UrlItem( SolrIndexerService.getBaseUrl( ) );
422 url.addParameter( PARAMETER_DOCUMENT_ID, document.getId( ) );
423 url.addParameter( PARAMETER_ATTRIBUTE_ID, attribute.getId( ) );
424 item.addDynamicField( strName, url.getUrl( ) );
425 }
426 }
427 }
428 }
429
430
431 if ( document.getXmlMetadata( ) != null )
432 {
433 sbContentToIndex.append( document.getXmlMetadata( ) );
434 }
435
436 return sbContentToIndex.toString( );
437 }
438
439
440
441
442
443
444
445 @Override
446 public String getName( )
447 {
448 return AppPropertiesService.getProperty( PROPERTY_NAME );
449 }
450
451
452
453
454
455
456 @Override
457 public String getVersion( )
458 {
459 return AppPropertiesService.getProperty( PROPERTY_VERSION );
460 }
461
462
463
464
465 @Override
466 public String getDescription( )
467 {
468 return AppPropertiesService.getProperty( PROPERTY_DESCRIPTION );
469 }
470
471
472
473
474 @Override
475 public List<Field> getAdditionalFields( )
476 {
477 Collection<DocumentType> cAllTypes = DocumentTypeHome.findAll( );
478 List<Field> lstFields = new ArrayList<Field>( );
479
480 for ( DocumentType type : cAllTypes )
481 {
482 DocumentAttributeHome.setDocumentTypeAttributes( type );
483
484 for ( DocumentAttribute attribute : type.getAttributes( ) )
485 {
486 Field field = new Field( );
487 field.setEnableFacet( true );
488 field.setDescription( attribute.getDescription( ) );
489 field.setIsFacet( true );
490 field.setName( attribute.getCode( ) + SolrItem.DYNAMIC_TEXT_FIELD_SUFFIX );
491 field.setLabel( attribute.getName( ) );
492
493 lstFields.add( field );
494 }
495 }
496
497 return lstFields;
498 }
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518 private SolrItem getDocument( Document document, String strUrl, String strRole, String strPortletDocumentId ) throws IOException, InterruptedException
519 {
520
521 SolrItem item = new SolrItem( );
522
523
524
525 item.setUrl( strUrl );
526
527
528 item.setDocPortletId( strPortletDocumentId );
529
530
531
532
533 item.setDate( document.getDateModification( ) );
534
535
536
537
538 String strIdDocument = String.valueOf( document.getId( ) );
539 item.setUid( getResourceUid( strIdDocument, DocumentIndexerUtils.CONSTANT_TYPE_RESOURCE ) );
540
541 String strContentToIndex = getContentToIndex( document, item );
542 ContentHandler handler = new BodyContentHandler( );
543 Metadata metadata = new Metadata( );
544
545 try
546 {
547 new org.apache.tika.parser.html.HtmlParser( ).parse( new ByteArrayInputStream( strContentToIndex.getBytes( ) ), handler, metadata,
548 new ParseContext( ) );
549 }
550 catch( SAXException e )
551 {
552 throw new AppException( "Error during document parsing." );
553 }
554 catch( TikaException e )
555 {
556 throw new AppException( "Error during document parsing." );
557 }
558
559
560
561 item.setContent( handler.toString( ) );
562
563
564
565 item.setTitle( document.getTitle( ) );
566
567 item.setType( document.getType( ) );
568
569 item.setRole( strRole );
570
571 item.setSite( SolrIndexerService.getWebAppName( ) );
572
573
574 return item;
575 }
576
577
578
579
580 @Override
581 public List<SolrItem> getDocuments( String strIdDocument )
582 {
583 List<SolrItem> lstItems = new ArrayList<SolrItem>( );
584
585 int nIdDocument = Integer.parseInt( strIdDocument );
586 Document document = DocumentHome.findByPrimaryKey( nIdDocument );
587 Iterator<Portlet> it = PublishingService.getInstance( ).getPortletsByDocumentId( Integer.toString( nIdDocument ) ).iterator( );
588
589 try
590 {
591 while ( it.hasNext( ) )
592 {
593 Portlet portlet = it.next( );
594 UrlItem url = new UrlItem( SolrIndexerService.getBaseUrl( ) );
595 url.addParameter( PARAMETER_DOCUMENT_ID, nIdDocument );
596 url.addParameter( PARAMETER_PORTLET_ID, portlet.getId( ) );
597
598 String strPortletDocumentId = nIdDocument + "&" + portlet.getId( );
599 Page page = PageHome.getPage( portlet.getPageId( ) );
600
601 lstItems.add( getDocument( document, url.getUrl( ), page.getRole( ), strPortletDocumentId ) );
602 }
603 }
604 catch( Exception e )
605 {
606 throw new RuntimeException( e );
607 }
608
609 return lstItems;
610 }
611
612
613
614
615 @Override
616 public List<String> getResourcesName( )
617 {
618 return LIST_RESSOURCES_NAME;
619 }
620
621
622
623
624 @Override
625 public String getResourceUid( String strResourceId, String strResourceType )
626 {
627 StringBuilder sb = new StringBuilder( strResourceId );
628 sb.append( SolrConstants.CONSTANT_UNDERSCORE ).append( SHORT_NAME );
629
630 return sb.toString( );
631 }
632 }