1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.portal.service.search;
35
36 import fr.paris.lutece.portal.business.page.Page;
37 import fr.paris.lutece.portal.business.page.PageHome;
38 import fr.paris.lutece.portal.service.message.SiteMessageException;
39 import fr.paris.lutece.portal.service.page.IPageService;
40 import fr.paris.lutece.portal.service.spring.SpringContextService;
41 import fr.paris.lutece.portal.service.util.AppException;
42 import fr.paris.lutece.portal.service.util.AppPropertiesService;
43 import fr.paris.lutece.util.url.UrlItem;
44 import org.apache.lucene.index.IndexOptions;
45
46 import org.apache.commons.lang3.StringUtils;
47
48 import org.apache.lucene.document.DateTools;
49 import org.apache.lucene.document.Document;
50 import org.apache.lucene.document.Field;
51 import org.apache.lucene.document.FieldType;
52 import org.apache.lucene.document.StoredField;
53 import org.apache.lucene.document.StringField;
54 import org.apache.lucene.document.TextField;
55
56 import org.apache.tika.exception.TikaException;
57 import org.apache.tika.metadata.Metadata;
58 import org.apache.tika.parser.ParseContext;
59 import org.apache.tika.parser.html.HtmlParser;
60 import org.apache.tika.sax.BodyContentHandler;
61
62 import org.xml.sax.ContentHandler;
63 import org.xml.sax.SAXException;
64
65 import java.io.ByteArrayInputStream;
66 import java.io.IOException;
67
68 import java.util.ArrayList;
69 import java.util.List;
70
71
72
73
74 public class PageIndexer implements SearchIndexer
75 {
76 public static final String INDEX_TYPE_PAGE = "Page";
77 public static final String INDEXER_NAME = "PageIndexer";
78 protected static final String PROPERTY_PAGE_BASE_URL = "search.pageIndexer.baseUrl";
79 protected static final String PROPERTY_SEARCH_PAGE_URL = "search.pageSearch.baseUrl";
80 protected static final String PROPERTY_INDEXER_ENABLE = "search.pageIndexer.enable";
81 protected static final String PARAMETER_PAGE_ID = "page_id";
82 private static IPageService _pageService = SpringContextService.getBean( "pageService" );
83 private static final String INDEXER_DESCRIPTION = "Indexer service for pages";
84 private static final String INDEXER_VERSION = "1.0.0";
85
86
87
88
89 @Override
90 public void indexDocuments( ) throws IOException, InterruptedException, SiteMessageException
91 {
92 String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
93 List<Page> listPages = PageHome.getAllPages( );
94
95 for ( Page page : listPages )
96 {
97 UrlItem/url/UrlItem.html#UrlItem">UrlItem url = new UrlItem( strPageBaseUrl );
98 url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
99
100 Document doc = null;
101
102 try
103 {
104 doc = getDocument( page, url.getUrl( ) );
105 }
106 catch( Exception e )
107 {
108 String strMessage = "Page ID : " + page.getId( );
109 IndexationService.error( this, e, strMessage );
110 }
111
112 if ( doc != null )
113 {
114 IndexationService.write( doc );
115 }
116 }
117 }
118
119
120
121
122 @Override
123 public List<Document> getDocuments( String nIdDocument ) throws IOException, InterruptedException, SiteMessageException
124 {
125 ArrayList<Document> listDocuments = new ArrayList<>( );
126 String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
127
128 Page page = PageHome.getPage( Integer.parseInt( nIdDocument ) );
129
130 if ( ( page != null ) && ( page.getId( ) != 0 ) )
131 {
132 UrlItem/url/UrlItem.html#UrlItem">UrlItem url = new UrlItem( strPageBaseUrl );
133 url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
134
135 Document doc = getDocument( page, url.getUrl( ) );
136 listDocuments.add( doc );
137 }
138
139 return listDocuments;
140 }
141
142
143
144
145 @Override
146 public String getName( )
147 {
148 return INDEXER_NAME;
149 }
150
151
152
153
154 @Override
155 public String getVersion( )
156 {
157 return INDEXER_VERSION;
158 }
159
160
161
162
163 @Override
164 public String getDescription( )
165 {
166 return INDEXER_DESCRIPTION;
167 }
168
169
170
171
172 @Override
173 public boolean isEnable( )
174 {
175 String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE, Boolean.TRUE.toString( ) );
176
177 return ( strEnable.equalsIgnoreCase( Boolean.TRUE.toString( ) ) );
178 }
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196 protected Document getDocument( Page page, String strUrl ) throws IOException, InterruptedException, SiteMessageException
197 {
198 FieldType ft = new FieldType( StringField.TYPE_STORED );
199 ft.setOmitNorms( false );
200
201 FieldType ftNotStored = new FieldType( StringField.TYPE_NOT_STORED );
202 ftNotStored.setOmitNorms( false );
203 ftNotStored.setTokenized( false );
204
205 FieldType ftDate = new FieldType( StringField.TYPE_STORED );
206 ftDate.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
207 ftDate.setStored( true );
208 ftDate.setOmitNorms( false );
209
210 FieldType ftUid = ftNotStored;
211 ftUid.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
212
213 Document doc = new Document( );
214
215
216
217 doc.add( new StoredField( SearchItem.FIELD_URL, strUrl ) );
218
219
220
221
222 String strDate = DateTools.dateToString( page.getDateUpdate( ), DateTools.Resolution.DAY );
223 doc.add( new Field( SearchItem.FIELD_DATE, strDate, ftDate ) );
224
225
226
227
228 String strIdPage = String.valueOf( page.getId( ) );
229 doc.add( new Field( SearchItem.FIELD_UID, strIdPage, ftUid ) );
230
231 String strPageContent = _pageService.getPageContent( page.getId( ), 0, null );
232 ContentHandler handler = new BodyContentHandler( );
233 Metadata metadata = new Metadata( );
234
235 try
236 {
237 new HtmlParser( ).parse( new ByteArrayInputStream( strPageContent.getBytes( ) ), handler, metadata, new ParseContext( ) );
238 }
239 catch( TikaException | SAXException e )
240 {
241 throw new AppException( "Error during page parsing." );
242 }
243
244
245
246 StringBuilder sb = new StringBuilder( handler.toString( ) );
247
248
249
250 StringBuilder sbFieldContent = new StringBuilder( );
251 StringBuilder sbFieldMetadata = new StringBuilder( );
252 sbFieldContent.append( page.getName( ) ).append( " " ).append( sb.toString( ) );
253
254
255 if ( page.getDescription( ) != null )
256 {
257 sbFieldContent.append( " " ).append( page.getDescription( ) );
258 }
259
260
261 String strMetaKeywords = page.getMetaKeywords( );
262
263 if ( StringUtils.isNotBlank( strMetaKeywords ) )
264 {
265 sbFieldContent.append( " " ).append( strMetaKeywords );
266 sbFieldMetadata.append( strMetaKeywords );
267 }
268
269 doc.add( new Field( SearchItem.FIELD_CONTENTS, sbFieldContent.toString( ), TextField.TYPE_NOT_STORED ) );
270
271 if ( StringUtils.isNotBlank( page.getMetaDescription( ) ) )
272 {
273 if ( sbFieldMetadata.length( ) > 0 )
274 {
275 sbFieldMetadata.append( " " );
276 }
277
278 sbFieldMetadata.append( page.getMetaDescription( ) );
279 }
280
281 if ( sbFieldMetadata.length( ) > 0 )
282 {
283 doc.add( new Field( SearchItem.FIELD_METADATA, sbFieldMetadata.toString( ), TextField.TYPE_NOT_STORED ) );
284 }
285
286
287
288 doc.add( new Field( SearchItem.FIELD_TITLE, page.getName( ), ft ) );
289
290 if ( StringUtils.isNotBlank( page.getDescription( ) ) )
291 {
292
293
294 doc.add( new StoredField( SearchItem.FIELD_SUMMARY, page.getDescription( ) ) );
295 }
296
297 doc.add( new Field( SearchItem.FIELD_TYPE, INDEX_TYPE_PAGE, ft ) );
298 doc.add( new Field( SearchItem.FIELD_ROLE, page.getRole( ), ft ) );
299
300
301 return doc;
302 }
303
304
305
306
307 @Override
308 public List<String> getListType( )
309 {
310 List<String> listType = new ArrayList<>( );
311 listType.add( INDEX_TYPE_PAGE );
312
313 return listType;
314 }
315
316
317
318
319 @Override
320 public String getSpecificSearchAppUrl( )
321 {
322 return AppPropertiesService.getProperty( PROPERTY_SEARCH_PAGE_URL );
323 }
324 }