1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.plugins.document.service.search;
35
import fr.paris.lutece.plugins.document.business.Document;
import fr.paris.lutece.plugins.document.business.DocumentHome;
import fr.paris.lutece.plugins.document.business.DocumentTypeHome;
import fr.paris.lutece.plugins.document.business.attributes.DocumentAttribute;
import fr.paris.lutece.plugins.document.business.portlet.DocumentListPortletHome;
import fr.paris.lutece.plugins.document.service.publishing.PublishingService;
import fr.paris.lutece.plugins.document.utils.IntegerUtils;
import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexer;
import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexerFactory;
import fr.paris.lutece.portal.business.page.Page;
import fr.paris.lutece.portal.business.page.PageHome;
import fr.paris.lutece.portal.business.portlet.Portlet;
import fr.paris.lutece.portal.business.portlet.PortletHome;
import fr.paris.lutece.portal.service.search.IndexationService;
import fr.paris.lutece.portal.service.search.SearchIndexer;
import fr.paris.lutece.portal.service.search.SearchItem;
import fr.paris.lutece.portal.service.spring.SpringContextService;
import fr.paris.lutece.portal.service.util.AppException;
import fr.paris.lutece.portal.service.util.AppLogService;
import fr.paris.lutece.portal.service.util.AppPropertiesService;
import fr.paris.lutece.util.ReferenceItem;
import fr.paris.lutece.util.url.UrlItem;

import org.apache.commons.lang3.StringUtils;

import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.ByteArrayInputStream;
import java.io.IOException;

import java.nio.charset.StandardCharsets;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
83
84
85
86
87
88 public class DocumentIndexer implements SearchIndexer
89 {
90 public static final String INDEXER_NAME = "DocumentIndexer";
91 public static final String SHORT_NAME = "dcm";
92 private static final String INDEXER_DESCRIPTION = "Indexer service for documents";
93 private static final String INDEXER_VERSION = "1.0.0";
94 private static final String PROPERTY_PAGE_BASE_URL = "document.documentIndexer.baseUrl";
95 private static final String PROPERTY_INDEXER_ENABLE = "document.documentIndexer.enable";
96 private static final String PARAMETER_DOCUMENT_ID = "document_id";
97 private static final String PARAMETER_PORTLET_ID = "portlet_id";
98 private static final String JSP_PAGE_ADVANCED_SEARCH = "jsp/site/Portal.jsp?page=advanced_search";
99 private static final String PROPERTY_WRITER_MAX_FIELD_LENGTH = "search.lucene.writer.maxFieldLength";
100 private static final int DEFAULT_WRITER_MAX_FIELD_LENGTH = 1000000;
101
102
103
104
105
106
107
108 @Override
109 public void indexDocuments( ) throws IOException, InterruptedException
110 {
111 String strBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
112 Page page;
113
114 for ( Portlet portlet : PortletHome.findByType( DocumentListPortletHome.getInstance( ).getPortletTypeId( ) ) )
115 {
116 page = PageHome.getPage( portlet.getPageId( ) );
117
118 for ( Document d : PublishingService.getInstance( ).getPublishedDocumentsByPortletId( portlet.getId( ) ) )
119 {
120 Document document = DocumentHome.findByPrimaryKey( d.getId( ) );
121
122
123 UrlItem url = new UrlItem( strBaseUrl );
124 url.addParameter( PARAMETER_DOCUMENT_ID, document.getId( ) );
125 url.addParameter( PARAMETER_PORTLET_ID, portlet.getId( ) );
126
127 String strPortletDocumentId = document.getId( ) + "_" + SHORT_NAME + "&" + portlet.getId( );
128 org.apache.lucene.document.Document doc = null;
129
130 try
131 {
132 doc = getDocument( document, url.getUrl( ), page.getRole( ), strPortletDocumentId );
133 }
134 catch ( Exception e )
135 {
136 String strMessage = "Document ID : " + document.getId( ) + " - Portlet ID : " + portlet.getId( );
137 IndexationService.error( this, e, strMessage );
138 }
139
140 if ( doc != null )
141 {
142 IndexationService.write( doc );
143 }
144 }
145 }
146 }
147
148
149
150
151
152
153
154
155 @Override
156 public List<org.apache.lucene.document.Document> getDocuments( String strIdDocument )
157 throws IOException, InterruptedException
158 {
159 List<org.apache.lucene.document.Document> listDocs = new ArrayList<org.apache.lucene.document.Document>( );
160 int nIdDocument = IntegerUtils.convert( strIdDocument );
161 Document document = DocumentHome.findByPrimaryKey( nIdDocument );
162 Iterator<Portlet> it = PublishingService.getInstance( ).getPortletsByDocumentId( strIdDocument ).iterator( );
163 String strBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
164 Page page;
165
166 while ( it.hasNext( ) )
167 {
168 Portlet portlet = it.next( );
169 UrlItem url = new UrlItem( strBaseUrl );
170 url.addParameter( PARAMETER_DOCUMENT_ID, nIdDocument );
171 url.addParameter( PARAMETER_PORTLET_ID, portlet.getId( ) );
172
173 String strPortletDocumentId = nIdDocument + "_" + SHORT_NAME + "&" + portlet.getId( );
174
175 page = PageHome.getPage( portlet.getPageId( ) );
176
177 org.apache.lucene.document.Document doc = getDocument( document, url.getUrl( ), page.getRole( ),
178 strPortletDocumentId );
179 listDocs.add( doc );
180 }
181
182 return listDocs;
183 }
184
185
186
187
188
189 @Override
190 public String getName( )
191 {
192 return INDEXER_NAME;
193 }
194
195
196
197
198
199 @Override
200 public String getVersion( )
201 {
202 return INDEXER_VERSION;
203 }
204
205
206
207
208
209 @Override
210 public String getDescription( )
211 {
212 return INDEXER_DESCRIPTION;
213 }
214
215
216
217
218
219 @Override
220 public boolean isEnable( )
221 {
222 String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE, "true" );
223
224 return ( strEnable.equalsIgnoreCase( "true" ) );
225 }
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241 public static org.apache.lucene.document.Document getDocument( Document document, String strUrl, String strRole,
242 String strPortletDocumentId ) throws IOException, InterruptedException
243 {
244
245 org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document( );
246
247 FieldType ft = new FieldType( StringField.TYPE_STORED );
248 ft.setOmitNorms( false );
249
250
251
252 doc.add( new Field( SearchItem.FIELD_URL, strUrl, ft ) );
253
254
255 doc.add( new Field( SearchItem.FIELD_DOCUMENT_PORTLET_ID, strPortletDocumentId, ft ) );
256
257
258
259
260 String strDate = DateTools.dateToString( document.getDateModification( ), DateTools.Resolution.DAY );
261 doc.add( new Field( SearchItem.FIELD_DATE, strDate, ft ) );
262
263
264
265
266 String strIdDocument = String.valueOf( document.getId( ) );
267 doc.add( new Field( SearchItem.FIELD_UID, strIdDocument + "_" + DocumentIndexer.SHORT_NAME, ft ) );
268
269 String strContentToIndex = getContentToIndex( document );
270 int nWriteLimit = AppPropertiesService.getPropertyInt( PROPERTY_WRITER_MAX_FIELD_LENGTH, DEFAULT_WRITER_MAX_FIELD_LENGTH );
271 ContentHandler handler = new BodyContentHandler( nWriteLimit );
272 Metadata metadata = new Metadata( );
273
274 try
275 {
276 new HtmlParser( ).parse( new ByteArrayInputStream( strContentToIndex.getBytes( ) ), handler, metadata,
277 new ParseContext( ) );
278 }
279 catch ( SAXException e )
280 {
281 throw new AppException( "Error during document parsing.", e );
282 }
283 catch ( TikaException e )
284 {
285 throw new AppException( "Error during document parsing.", e );
286 }
287
288
289
290 String strContent = handler.toString( );
291
292
293
294 doc.add( new Field( SearchItem.FIELD_CONTENTS, strContent, TextField.TYPE_NOT_STORED ) );
295
296
297
298 FieldType ft2 = new FieldType( TextField.TYPE_STORED );
299 ft2.setOmitNorms( true );
300 doc.add( new Field( SearchItem.FIELD_TITLE, document.getTitle( ), ft2 ) );
301
302 doc.add( new Field( SearchItem.FIELD_TYPE, document.getType( ), ft ) );
303
304 doc.add( new Field( SearchItem.FIELD_ROLE, strRole, ft ) );
305
306
307 doc.add( new Field( SearchItem.FIELD_METADATA, document.getSummary( ), TextField.TYPE_NOT_STORED ) );
308 doc.add( new StoredField( SearchItem.FIELD_SUMMARY, document.getSummary( ) ) );
309
310
311 return doc;
312 }
313
314
315
316
317
318
319 private static String getContentToIndex( Document document )
320 {
321 StringBuilder sbContentToIndex = new StringBuilder( );
322 sbContentToIndex.append( document.getTitle( ) );
323
324 for ( DocumentAttribute attribute : document.getAttributes( ) )
325 {
326 if ( attribute.isSearchable( ) )
327 {
328 if ( !attribute.isBinary( ) )
329 {
330
331 sbContentToIndex.append( " " );
332 sbContentToIndex.append( attribute.getTextValue( ) );
333 }
334 else
335 {
336
337
338 IFileIndexerFactory factoryIndexer = (IFileIndexerFactory) SpringContextService.getBean( IFileIndexerFactory.BEAN_FILE_INDEXER_FACTORY );
339 IFileIndexer indexer = factoryIndexer.getIndexer( attribute.getValueContentType( ) );
340
341 if ( indexer != null )
342 {
343 try
344 {
345 ByteArrayInputStream bais = new ByteArrayInputStream( attribute.getBinaryValue( ) );
346 sbContentToIndex.append( " " );
347 sbContentToIndex.append( indexer.getContentToIndex( bais ) );
348 bais.close( );
349 }
350 catch ( IOException e )
351 {
352 AppLogService.error( e.getMessage( ), e );
353 }
354 }
355 }
356 }
357 }
358
359
360 sbContentToIndex.append( " " );
361 sbContentToIndex.append( StringUtils.defaultString( document.getXmlMetadata( ) ) );
362
363 return sbContentToIndex.toString( );
364 }
365
366
367
368
369 @Override
370 public List<String> getListType( )
371 {
372 List<String> typeList = new ArrayList<String>( );
373
374 for ( ReferenceItem item : DocumentTypeHome.getDocumentTypesList( ) )
375 {
376 typeList.add( item.getName( ) );
377 }
378
379 return typeList;
380 }
381
382
383
384
385 @Override
386 public String getSpecificSearchAppUrl( )
387 {
388 return JSP_PAGE_ADVANCED_SEARCH;
389 }
390 }