1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.plugins.directories.modules.solr.indexer;
35
import fr.paris.lutece.plugins.directories.business.DirectoryEntity;
import fr.paris.lutece.plugins.directories.service.DirectoriesService;
import fr.paris.lutece.plugins.directories.util.DirectoriesUtils;
import fr.paris.lutece.plugins.genericattributes.business.Entry;
import fr.paris.lutece.plugins.genericattributes.business.FieldHome;
import fr.paris.lutece.plugins.genericattributes.business.Response;
import fr.paris.lutece.plugins.search.solr.business.field.Field;
import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexer;
import fr.paris.lutece.plugins.search.solr.indexer.SolrIndexerService;
import fr.paris.lutece.plugins.search.solr.indexer.SolrItem;
import fr.paris.lutece.plugins.search.solr.util.SolrConstants;
import fr.paris.lutece.portal.service.util.AppException;
import fr.paris.lutece.portal.service.util.AppLogService;
import fr.paris.lutece.portal.service.util.AppPropertiesService;
import fr.paris.lutece.util.url.UrlItem;
import javassist.expr.NewArray;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
71
72
73
74
75
76 public class SolrDocIndexer implements SolrIndexer
77 {
78 public static final String BEAN_NAME = "directories-solr.solrDocIndexer";
79 private static final String TYPE = "directories";
80 private static final String PARAMETER_ENTITY_ID = "entity_id";
81 private static final String PROPERTY_INDEXER_ENABLE = "solr.indexer.document.enable";
82 private static final String PROPERTY_DOCUMENT_MAX_CHARS = "directories-solr.indexer.document.characters.limit";
83 private static final String PROPERTY_NAME = "directories-solr.indexer.name";
84 private static final String PROPERTY_DESCRIPTION = "directories-solr.indexer.description";
85 private static final String PROPERTY_VERSION = "directories-solr.indexer.version";
86 private static final String PARAMETER_XPAGE = "page";
87 private static final String XPAGE_DIRECTORIES = "directories";
88 private static final String PARAMETER_VIEW = "view";
89 private static final String PARAMETER_VIEW_ENTITY = "viewDirectoryEntity";
90 private static final List<String> LIST_RESSOURCES_NAME = new ArrayList<String>( );
91 private static final String SHORT_NAME = "entity";
92 private static final String DOC_INDEXATION_ERROR = "[SolrDirectoriesIndexer] An error occured during the indexation of the document number ";
93 private static final Integer PARAMETER_DOCUMENT_MAX_CHARS = Integer.parseInt( AppPropertiesService.getProperty( PROPERTY_DOCUMENT_MAX_CHARS ) );
94
95
96
97
98 public SolrDocIndexer( )
99 {
100 LIST_RESSOURCES_NAME.add( DirectoriesUtils.CONSTANT_TYPE_RESOURCE );
101 }
102
103 @Override
104 public boolean isEnable( )
105 {
106 return "true".equalsIgnoreCase( AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE ) );
107 }
108
109
110
111
112 @Override
113 public List<String> indexDocuments( )
114 {
115 List<String> lstErrors = new ArrayList<String>( );
116 List<Integer> listDocument = new ArrayList<Integer>( );
117 Collection<SolrItem> solrItems = new ArrayList<SolrItem>( );
118 for ( DirectoryEntity document : DirectoriesService.getInstance( ).getListDocWithoutBinaries( ) )
119 {
120 try
121 {
122 if ( !listDocument.contains( document.getId( ) ) )
123 {
124
125 SolrItem item = getItem( document );
126 if ( item != null )
127 {
128 solrItems.add( item );
129 }
130 listDocument.add( document.getId( ) );
131 }
132 }
133 catch( Exception e )
134 {
135 lstErrors.add( SolrIndexerService.buildErrorMessage( e ) );
136 AppLogService.error( DOC_INDEXATION_ERROR + document.getId( ), e );
137 }
138 }
139 try
140 {
141 SolrIndexerService.write( solrItems );
142 }
143 catch( Exception e )
144 {
145 lstErrors.add( SolrIndexerService.buildErrorMessage( e ) );
146 AppLogService.error( DOC_INDEXATION_ERROR, e );
147 }
148 return lstErrors;
149 }
150
151
152
153
154
155
156
157
158
159
160
161 private SolrItem getItem( DirectoryEntity document ) throws IOException
162 {
163
164 SolrItem item = new SolrItem( );
165 item.setUid( getResourceUid( Integer.valueOf( document.getId( ) ).toString( ), DirectoriesUtils.CONSTANT_TYPE_RESOURCE ) );
166 item.setDate( document.getCreation( ) );
167 item.setType( TYPE );
168 item.setSite( SolrIndexerService.getWebAppName( ) );
169 item.setRole( "none" );
170 item.setTitle( document.getTitle( ) );
171
172 UrlItem url = new UrlItem( SolrIndexerService.getBaseUrl( ) );
173 url.addParameter( PARAMETER_XPAGE, XPAGE_DIRECTORIES );
174 url.addParameter( PARAMETER_VIEW, PARAMETER_VIEW_ENTITY );
175 url.addParameter( PARAMETER_ENTITY_ID, document.getId( ) );
176 item.setUrl( url.getUrl( ) );
177
178 GregorianCalendar calendar = new GregorianCalendar( );
179 calendar.setTime( document.getCreation( ) );
180 item.setHieDate( calendar.get( GregorianCalendar.YEAR ) + "/" + ( calendar.get( GregorianCalendar.MONTH ) + 1 ) + "/"
181 + calendar.get( GregorianCalendar.DAY_OF_MONTH ) );
182
183 String strContentToIndex = getContentToIndex( document, item );
184 ContentHandler handler = null;
185 if ( PARAMETER_DOCUMENT_MAX_CHARS != null )
186 {
187 handler = new BodyContentHandler( PARAMETER_DOCUMENT_MAX_CHARS );
188 }
189 else
190 {
191 handler = new BodyContentHandler( );
192 }
193 Metadata metadata = new Metadata( );
194 try
195 {
196 new HtmlParser( ).parse( new ByteArrayInputStream( strContentToIndex.getBytes( ) ), handler, metadata, new ParseContext( ) );
197 }
198 catch( SAXException e )
199 {
200 throw new AppException( "Error during document parsing." );
201 }
202 catch( TikaException e )
203 {
204 throw new AppException( "Error during document parsing." );
205 }
206 item.setContent( handler.toString( ) );
207 return item;
208 }
209
210
211
212
213
214
215
216
217
218
219 private static String getContentToIndex( DirectoryEntity document, SolrItem item )
220 {
221 StringBuilder sbContentToIndex = new StringBuilder( );
222 List<Response> listResponse = document.getResponses( );
223
224 Map<Entry, List<Response>> entryMap = listResponse.stream( ).collect( Collectors.groupingBy( Response::getEntry ) );
225
226 entryMap.forEach( ( entry, listResponseFiltered ) -> {
227
228 String strFieldName = "attribute" + listResponseFiltered.get( 0 ).getEntry( ).getIdEntry( );
229 List<String> valueList = new ArrayList<>( );
230
231 if ( listResponseFiltered.get( 0 ).getField( ) != null )
232 {
233 for ( Response response : listResponseFiltered )
234 {
235 int nIdField = response.getField( ).getIdField( );
236 fr.paris.lutece.plugins.genericattributes.business.Field field = FieldHome.findByPrimaryKey( nIdField );
237 String value = field.getTitle( );
238 if ( value == null )
239 {
240 value = response.getResponseValue( );
241 }
242 valueList.add( value );
243 sbContentToIndex.append( " " );
244 sbContentToIndex.append( value );
245 }
246 item.addDynamicField( strFieldName, valueList );
247 }
248 else
249 {
250 String value = listResponseFiltered.get( 0 ).getResponseValue( );
251 if ( value != null )
252 {
253 item.addDynamicField( strFieldName, value );
254 item.addDynamicFieldNotAnalysed( strFieldName, value );
255 sbContentToIndex.append( " " );
256 sbContentToIndex.append( value );
257 }
258 }
259 } );
260
261 String strContentDistinct = Arrays.stream( sbContentToIndex.toString( ).split( "\\s+" ) ).distinct( ).collect( Collectors.joining( " " ) );
262 String newStrContentDistinct = strContentDistinct.replaceAll( "null", "" );
263 StringBuilder sb = new StringBuilder( newStrContentDistinct );
264
265 return sb.toString( );
266 }
267
268
269
270
271
272
273
274 @Override
275 public String getName( )
276 {
277 return AppPropertiesService.getProperty( PROPERTY_NAME );
278 }
279
280
281
282
283
284
285 @Override
286 public String getVersion( )
287 {
288 return AppPropertiesService.getProperty( PROPERTY_VERSION );
289 }
290
291
292
293
294 @Override
295 public String getDescription( )
296 {
297 return AppPropertiesService.getProperty( PROPERTY_DESCRIPTION );
298 }
299
300
301
302
303 @Override
304 public List<Field> getAdditionalFields( )
305 {
306 List<Field> lstFields = new ArrayList<Field>( );
307 return lstFields;
308 }
309
310
311
312
313 @Override
314 public List<SolrItem> getDocuments( String strIdDocument )
315 {
316 List<SolrItem> lstItems = new ArrayList<SolrItem>( );
317 return lstItems;
318 }
319
320
321
322
323 @Override
324 public List<String> getResourcesName( )
325 {
326 return LIST_RESSOURCES_NAME;
327 }
328
329
330
331
332 @Override
333 public String getResourceUid( String strResourceId, String strResourceType )
334 {
335 StringBuilder sb = new StringBuilder( strResourceId );
336 sb.append( SolrConstants.CONSTANT_UNDERSCORE ).append( SHORT_NAME );
337 return sb.toString( );
338 }
339
340 }