1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.portal.service.search;
35
36 import fr.paris.lutece.portal.business.page.Page;
37 import fr.paris.lutece.portal.business.page.PageHome;
38 import fr.paris.lutece.portal.service.message.SiteMessageException;
39 import fr.paris.lutece.portal.service.page.IPageService;
40 import fr.paris.lutece.portal.service.spring.SpringContextService;
41 import fr.paris.lutece.portal.service.util.AppException;
42 import fr.paris.lutece.portal.service.util.AppPropertiesService;
43 import fr.paris.lutece.util.url.UrlItem;
44 import org.apache.lucene.index.IndexOptions;
45
46 import org.apache.commons.lang3.StringUtils;
47
48 import org.apache.lucene.document.DateTools;
49 import org.apache.lucene.document.Document;
50 import org.apache.lucene.document.Field;
51 import org.apache.lucene.document.FieldType;
52 import org.apache.lucene.document.StoredField;
53 import org.apache.lucene.document.StringField;
54 import org.apache.lucene.document.TextField;
55
56 import org.apache.tika.exception.TikaException;
57 import org.apache.tika.metadata.Metadata;
58 import org.apache.tika.parser.ParseContext;
59 import org.apache.tika.parser.html.HtmlParser;
60 import org.apache.tika.sax.BodyContentHandler;
61
62 import org.xml.sax.ContentHandler;
63 import org.xml.sax.SAXException;
64
65 import java.io.ByteArrayInputStream;
66 import java.io.IOException;
67
68 import java.util.ArrayList;
69 import java.util.List;
70
71
72
73
74 public class PageIndexer implements SearchIndexer
75 {
76 public static final String INDEX_TYPE_PAGE = "Page";
77 public static final String INDEXER_NAME = "PageIndexer";
78 protected static final String PROPERTY_PAGE_BASE_URL = "search.pageIndexer.baseUrl";
79 protected static final String PROPERTY_SEARCH_PAGE_URL = "search.pageSearch.baseUrl";
80 protected static final String PROPERTY_INDEXER_ENABLE = "search.pageIndexer.enable";
81 protected static final String PARAMETER_PAGE_ID = "page_id";
82 private static IPageService _pageService = SpringContextService.getBean( "pageService" );
83 private static final String INDEXER_DESCRIPTION = "Indexer service for pages";
84 private static final String INDEXER_VERSION = "1.0.0";
85
86
87
88
89 @Override
90 public void indexDocuments( ) throws IOException, InterruptedException, SiteMessageException
91 {
92 String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
93 List<Page> listPages = PageHome.getAllPages( );
94
95 for ( Page page : listPages )
96 {
97 UrlItem/url/UrlItem.html#UrlItem">UrlItem url = new UrlItem( strPageBaseUrl );
98 url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
99
100 Document doc = null;
101
102 try
103 {
104 doc = getDocument( page, url.getUrl( ) );
105 }
106 catch( Exception e )
107 {
108 String strMessage = "Page ID : " + page.getId( );
109 IndexationService.error( this, e, strMessage );
110 }
111
112 if ( doc != null )
113 {
114 IndexationService.write( doc );
115 }
116 }
117 }
118
119
120
121
122 @Override
123 public List<Document> getDocuments( String nIdDocument ) throws IOException, InterruptedException, SiteMessageException
124 {
125 ArrayList<Document> listDocuments = new ArrayList<>( );
126 String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
127
128 Page page = PageHome.getPage( Integer.parseInt( nIdDocument ) );
129
130 if ( ( page != null ) && ( page.getId( ) != 0 ) )
131 {
132 UrlItem/url/UrlItem.html#UrlItem">UrlItem url = new UrlItem( strPageBaseUrl );
133 url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
134
135 Document doc = getDocument( page, url.getUrl( ) );
136 listDocuments.add( doc );
137 }
138
139 return listDocuments;
140 }
141
142
143
144
145 @Override
146 public String getName( )
147 {
148 return INDEXER_NAME;
149 }
150
151
152
153
154 @Override
155 public String getVersion( )
156 {
157 return INDEXER_VERSION;
158 }
159
160
161
162
163 @Override
164 public String getDescription( )
165 {
166 return INDEXER_DESCRIPTION;
167 }
168
169
170
171
172 @Override
173 public boolean isEnable( )
174 {
175 String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE, Boolean.TRUE.toString( ) );
176
177 return ( strEnable.equalsIgnoreCase( Boolean.TRUE.toString( ) ) );
178 }
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196 protected Document getDocument( Page page, String strUrl ) throws IOException, InterruptedException, SiteMessageException
197 {
198 FieldType ft = new FieldType( StringField.TYPE_STORED );
199 ft.setOmitNorms( false );
200
201 FieldType ftdfp = new FieldType( StringField.TYPE_STORED );
202 ftdfp.setOmitNorms( false );
203 ftdfp.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS );
204
205 FieldType ftNotStored = new FieldType( StringField.TYPE_NOT_STORED );
206 ftNotStored.setOmitNorms( false );
207 ftNotStored.setTokenized( false );
208
209
210 Document doc = new Document( );
211
212
213
214 doc.add( new StoredField( SearchItem.FIELD_URL, strUrl ) );
215
216
217
218
219 String strDate = DateTools.dateToString( page.getDateUpdate( ), DateTools.Resolution.DAY );
220 doc.add( new Field( SearchItem.FIELD_DATE, strDate, ft ) );
221
222
223
224
225 String strIdPage = String.valueOf( page.getId( ) );
226 doc.add( new Field( SearchItem.FIELD_UID, strIdPage, ftNotStored ) );
227
228 String strPageContent = _pageService.getPageContent( page.getId( ), 0, null );
229 ContentHandler handler = new BodyContentHandler( );
230 Metadata metadata = new Metadata( );
231
232 try
233 {
234 new HtmlParser( ).parse( new ByteArrayInputStream( strPageContent.getBytes( ) ), handler, metadata, new ParseContext( ) );
235 }
236 catch( TikaException | SAXException e )
237 {
238 throw new AppException( "Error during page parsing." );
239 }
240
241
242
243 StringBuilder sb = new StringBuilder( handler.toString( ) );
244
245
246
247 StringBuilder sbFieldContent = new StringBuilder( );
248 StringBuilder sbFieldMetadata = new StringBuilder( );
249 sbFieldContent.append( page.getName( ) ).append( " " ).append( sb.toString( ) );
250
251
252 if ( page.getDescription( ) != null )
253 {
254 sbFieldContent.append( " " ).append( page.getDescription( ) );
255 }
256
257
258 String strMetaKeywords = page.getMetaKeywords( );
259
260 if ( StringUtils.isNotBlank( strMetaKeywords ) )
261 {
262 sbFieldContent.append( " " ).append( strMetaKeywords );
263 sbFieldMetadata.append( strMetaKeywords );
264 }
265
266 doc.add( new Field( SearchItem.FIELD_CONTENTS, sbFieldContent.toString( ), TextField.TYPE_NOT_STORED ) );
267
268 if ( StringUtils.isNotBlank( page.getMetaDescription( ) ) )
269 {
270 if ( sbFieldMetadata.length( ) > 0 )
271 {
272 sbFieldMetadata.append( " " );
273 }
274
275 sbFieldMetadata.append( page.getMetaDescription( ) );
276 }
277
278 if ( sbFieldMetadata.length( ) > 0 )
279 {
280 doc.add( new Field( SearchItem.FIELD_METADATA, sbFieldMetadata.toString( ), TextField.TYPE_NOT_STORED ) );
281 }
282
283
284
285 doc.add( new Field( SearchItem.FIELD_TITLE, page.getName( ), ftdfp ) );
286
287 if ( StringUtils.isNotBlank( page.getDescription( ) ) )
288 {
289
290
291 doc.add( new StoredField( SearchItem.FIELD_SUMMARY, page.getDescription( ) ) );
292 }
293
294 doc.add( new Field( SearchItem.FIELD_TYPE, INDEX_TYPE_PAGE, ft ) );
295 doc.add( new Field( SearchItem.FIELD_ROLE, page.getRole( ), ft ) );
296
297
298 return doc;
299 }
300
301
302
303
304 @Override
305 public List<String> getListType( )
306 {
307 List<String> listType = new ArrayList<>( );
308 listType.add( INDEX_TYPE_PAGE );
309
310 return listType;
311 }
312
313
314
315
316 @Override
317 public String getSpecificSearchAppUrl( )
318 {
319 return AppPropertiesService.getProperty( PROPERTY_SEARCH_PAGE_URL );
320 }
321 }