1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.portal.service.search;
35
36 import fr.paris.lutece.portal.business.page.Page;
37 import fr.paris.lutece.portal.business.page.PageHome;
38 import fr.paris.lutece.portal.service.message.SiteMessageException;
39 import fr.paris.lutece.portal.service.page.IPageService;
40 import fr.paris.lutece.portal.service.spring.SpringContextService;
41 import fr.paris.lutece.portal.service.util.AppException;
42 import fr.paris.lutece.portal.service.util.AppPropertiesService;
43 import fr.paris.lutece.util.url.UrlItem;
44
45 import org.apache.commons.lang.StringUtils;
46
47 import org.apache.lucene.document.DateTools;
48 import org.apache.lucene.document.Document;
49 import org.apache.lucene.document.Field;
50 import org.apache.lucene.document.FieldType;
51 import org.apache.lucene.document.StoredField;
52 import org.apache.lucene.document.StringField;
53 import org.apache.lucene.document.TextField;
54
55 import org.apache.tika.exception.TikaException;
56 import org.apache.tika.metadata.Metadata;
57 import org.apache.tika.parser.ParseContext;
58 import org.apache.tika.parser.html.HtmlParser;
59 import org.apache.tika.sax.BodyContentHandler;
60
61 import org.xml.sax.ContentHandler;
62 import org.xml.sax.SAXException;
63
64 import java.io.ByteArrayInputStream;
65 import java.io.IOException;
66
67 import java.util.ArrayList;
68 import java.util.List;
69
70
71
72
73
74 public class PageIndexer implements SearchIndexer
75 {
76 public static final String INDEX_TYPE_PAGE = "Page";
77 public static final String INDEXER_NAME = "PageIndexer";
78 protected static final String PROPERTY_PAGE_BASE_URL = "search.pageIndexer.baseUrl";
79 protected static final String PROPERTY_SEARCH_PAGE_URL = "search.pageSearch.baseUrl";
80 protected static final String PROPERTY_INDEXER_ENABLE = "search.pageIndexer.enable";
81 protected static final String PARAMETER_PAGE_ID = "page_id";
82 protected static IPageService _pageService = (IPageService) SpringContextService.getBean( "pageService" );
83 private static final String INDEXER_DESCRIPTION = "Indexer service for pages";
84 private static final String INDEXER_VERSION = "1.0.0";
85
86
87
88
89 @Override
90 public void indexDocuments( ) throws IOException, InterruptedException, SiteMessageException
91 {
92 String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
93 List<Page> listPages = PageHome.getAllPages( );
94
95 for ( Page page : listPages )
96 {
97 UrlItem url = new UrlItem( strPageBaseUrl );
98 url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
99
100 Document doc = null;
101
102 try
103 {
104 doc = getDocument( page, url.getUrl( ) );
105 }
106 catch ( Exception e )
107 {
108 String strMessage = "Page ID : " + page.getId( );
109 IndexationService.error( this, e, strMessage );
110 }
111
112 if ( doc != null )
113 {
114 IndexationService.write( doc );
115 }
116 }
117 }
118
119
120
121
122 @Override
123 public List<Document> getDocuments( String nIdDocument )
124 throws IOException, InterruptedException, SiteMessageException
125 {
126 ArrayList<Document> listDocuments = new ArrayList<Document>( );
127 String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
128
129 Page page = PageHome.getPage( Integer.parseInt( nIdDocument ) );
130
131 if ( ( page != null ) && ( page.getId( ) != 0 ) )
132 {
133 UrlItem url = new UrlItem( strPageBaseUrl );
134 url.addParameter( PARAMETER_PAGE_ID, page.getId( ) );
135
136 Document doc = getDocument( page, url.getUrl( ) );
137 listDocuments.add( doc );
138 }
139
140 return listDocuments;
141 }
142
143
144
145
146 @Override
147 public String getName( )
148 {
149 return INDEXER_NAME;
150 }
151
152
153
154
155 @Override
156 public String getVersion( )
157 {
158 return INDEXER_VERSION;
159 }
160
161
162
163
164 @Override
165 public String getDescription( )
166 {
167 return INDEXER_DESCRIPTION;
168 }
169
170
171
172
173 @Override
174 public boolean isEnable( )
175 {
176 String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE, Boolean.TRUE.toString( ) );
177
178 return ( strEnable.equalsIgnoreCase( Boolean.TRUE.toString( ) ) );
179 }
180
181
182
183
184
185
186
187
188
189
190
191
192
193 protected Document getDocument( Page page, String strUrl )
194 throws IOException, InterruptedException, SiteMessageException
195 {
196 FieldType ft = new FieldType( StringField.TYPE_STORED );
197 ft.setOmitNorms( false );
198
199 FieldType ftNotStored = new FieldType( StringField.TYPE_NOT_STORED );
200 ftNotStored.setOmitNorms( false );
201 ftNotStored.setTokenized( false );
202
203
204 Document doc = new Document( );
205
206
207
208 doc.add( new Field( SearchItem.FIELD_URL, strUrl, ft ) );
209
210
211
212
213 String strDate = DateTools.dateToString( page.getDateUpdate( ), DateTools.Resolution.DAY );
214 doc.add( new Field( SearchItem.FIELD_DATE, strDate, ft ) );
215
216
217
218
219 String strIdPage = String.valueOf( page.getId( ) );
220 doc.add( new Field( SearchItem.FIELD_UID, strIdPage, ftNotStored ) );
221
222 String strPageContent = _pageService.getPageContent( page.getId( ), 0, null );
223 ContentHandler handler = new BodyContentHandler( );
224 Metadata metadata = new Metadata( );
225
226 try
227 {
228 new HtmlParser( ).parse( new ByteArrayInputStream( strPageContent.getBytes( ) ), handler, metadata,
229 new ParseContext( ) );
230 }
231 catch ( SAXException e )
232 {
233 throw new AppException( "Error during page parsing." );
234 }
235 catch ( TikaException e )
236 {
237 throw new AppException( "Error during page parsing." );
238 }
239
240
241
242 StringBuilder sb = new StringBuilder( handler.toString( ) );
243
244
245
246 StringBuilder sbFieldContent = new StringBuilder( );
247 StringBuilder sbFieldMetadata = new StringBuilder( );
248 sbFieldContent.append( page.getName( ) ).append( " " ).append( sb.toString( ) );
249
250
251 if ( page.getDescription( ) != null )
252 {
253 sbFieldContent.append( " " ).append( page.getDescription( ) );
254 }
255
256
257 String strMetaKeywords = page.getMetaKeywords( );
258
259 if ( StringUtils.isNotBlank( strMetaKeywords ) )
260 {
261 sbFieldContent.append( " " ).append( strMetaKeywords );
262 sbFieldMetadata.append( strMetaKeywords );
263 }
264
265 doc.add( new Field( SearchItem.FIELD_CONTENTS, sbFieldContent.toString( ), TextField.TYPE_NOT_STORED ) );
266
267 if ( StringUtils.isNotBlank( page.getMetaDescription( ) ) )
268 {
269 if ( sbFieldMetadata.length( ) > 0 )
270 {
271 sbFieldMetadata.append( " " );
272 }
273
274 sbFieldMetadata.append( page.getMetaDescription( ) );
275 }
276
277 if ( sbFieldMetadata.length( ) > 0 )
278 {
279 doc.add( new StringField( SearchItem.FIELD_METADATA, sbFieldMetadata.toString( ), Field.Store.NO ) );
280 }
281
282
283
284 doc.add( new Field( SearchItem.FIELD_TITLE, page.getName( ), ft ) );
285
286 if ( StringUtils.isNotBlank( page.getDescription( ) ) )
287 {
288
289
290 doc.add( new StoredField( SearchItem.FIELD_SUMMARY, page.getDescription( ) ) );
291 }
292
293 doc.add( new Field( SearchItem.FIELD_TYPE, INDEX_TYPE_PAGE, ft ) );
294 doc.add( new Field( SearchItem.FIELD_ROLE, page.getRole( ), ft ) );
295
296
297 return doc;
298 }
299
300
301
302
303 @Override
304 public List<String> getListType( )
305 {
306 List<String> listType = new ArrayList<String>( );
307 listType.add( INDEX_TYPE_PAGE );
308
309 return listType;
310 }
311
312
313
314
315 @Override
316 public String getSpecificSearchAppUrl( )
317 {
318 return AppPropertiesService.getProperty( PROPERTY_SEARCH_PAGE_URL );
319 }
320 }