View Javadoc
1   /*
2    * Copyright (c) 2002-2014, Mairie de Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.portal.service.search;
35  
36  import fr.paris.lutece.portal.business.page.Page;
37  import fr.paris.lutece.portal.business.page.PageHome;
38  import fr.paris.lutece.portal.service.message.SiteMessageException;
39  import fr.paris.lutece.portal.service.page.IPageService;
40  import fr.paris.lutece.portal.service.spring.SpringContextService;
41  import fr.paris.lutece.portal.service.util.AppException;
42  import fr.paris.lutece.portal.service.util.AppPropertiesService;
43  import fr.paris.lutece.util.url.UrlItem;
44  
45  import org.apache.commons.lang.StringUtils;
46  
47  import org.apache.lucene.document.DateTools;
48  import org.apache.lucene.document.Document;
49  import org.apache.lucene.document.Field;
50  import org.apache.lucene.document.FieldType;
51  import org.apache.lucene.document.StoredField;
52  import org.apache.lucene.document.StringField;
53  import org.apache.lucene.document.TextField;
54  
55  import org.apache.tika.exception.TikaException;
56  import org.apache.tika.metadata.Metadata;
57  import org.apache.tika.parser.ParseContext;
58  import org.apache.tika.parser.html.HtmlParser;
59  import org.apache.tika.sax.BodyContentHandler;
60  
61  import org.xml.sax.ContentHandler;
62  import org.xml.sax.SAXException;
63  
64  import java.io.ByteArrayInputStream;
65  import java.io.IOException;
66  
67  import java.util.ArrayList;
68  import java.util.List;
69  
70  
71  /**
72   * Indexer service for pages
73   */
74  public class PageIndexer implements SearchIndexer
75  {
76      public static final String INDEX_TYPE_PAGE = "Page";
77      public static final String INDEXER_NAME = "PageIndexer";
78      protected static final String PROPERTY_PAGE_BASE_URL = "search.pageIndexer.baseUrl";
79      protected static final String PROPERTY_SEARCH_PAGE_URL = "search.pageSearch.baseUrl";
80      protected static final String PROPERTY_INDEXER_ENABLE = "search.pageIndexer.enable";
81      protected static final String PARAMETER_PAGE_ID = "page_id";
82      protected static IPageService _pageService = (IPageService) SpringContextService.getBean( "pageService" );
83      private static final String INDEXER_DESCRIPTION = "Indexer service for pages";
84      private static final String INDEXER_VERSION = "1.0.0";
85  
86      /**
87       * {@inheritDoc}
88       */
89      @Override
90      public void indexDocuments(  ) throws IOException, InterruptedException, SiteMessageException
91      {
92          String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
93          List<Page> listPages = PageHome.getAllPages(  );
94  
95          for ( Page page : listPages )
96          {
97              UrlItem url = new UrlItem( strPageBaseUrl );
98              url.addParameter( PARAMETER_PAGE_ID, page.getId(  ) );
99  
100             Document doc = null;
101 
102             try
103             {
104                 doc = getDocument( page, url.getUrl(  ) );
105             }
106             catch ( Exception e )
107             {
108                 String strMessage = "Page ID : " + page.getId(  );
109                 IndexationService.error( this, e, strMessage );
110             }
111 
112             if ( doc != null )
113             {
114                 IndexationService.write( doc );
115             }
116         }
117     }
118 
119     /**
120      * {@inheritDoc}
121      */
122     @Override
123     public List<Document> getDocuments( String nIdDocument )
124         throws IOException, InterruptedException, SiteMessageException
125     {
126         ArrayList<Document> listDocuments = new ArrayList<Document>(  );
127         String strPageBaseUrl = AppPropertiesService.getProperty( PROPERTY_PAGE_BASE_URL );
128 
129         Page page = PageHome.getPage( Integer.parseInt( nIdDocument ) );
130 
131         if ( ( page != null ) && ( page.getId(  ) != 0 ) )
132         {
133             UrlItem url = new UrlItem( strPageBaseUrl );
134             url.addParameter( PARAMETER_PAGE_ID, page.getId(  ) );
135 
136             Document doc = getDocument( page, url.getUrl(  ) );
137             listDocuments.add( doc );
138         }
139 
140         return listDocuments;
141     }
142 
143     /**
144      * {@inheritDoc}
145      */
146     @Override
147     public String getName(  )
148     {
149         return INDEXER_NAME;
150     }
151 
152     /**
153      * {@inheritDoc}
154      */
155     @Override
156     public String getVersion(  )
157     {
158         return INDEXER_VERSION;
159     }
160 
161     /**
162      * {@inheritDoc}
163      */
164     @Override
165     public String getDescription(  )
166     {
167         return INDEXER_DESCRIPTION;
168     }
169 
170     /**
171      * {@inheritDoc}
172      */
173     @Override
174     public boolean isEnable(  )
175     {
176         String strEnable = AppPropertiesService.getProperty( PROPERTY_INDEXER_ENABLE, Boolean.TRUE.toString(  ) );
177 
178         return ( strEnable.equalsIgnoreCase( Boolean.TRUE.toString(  ) ) );
179     }
180 
181     /**
182      * Builds a document which will be used by Lucene during the indexing of the
183      * pages of the site with the following
184      * fields : summary, uid, url, contents, title and description.
185      * @return the built Document
186      * @param strUrl The base URL for documents
187      * @param page the page to index
188      * @throws IOException The IO Exception
189      * @throws InterruptedException The InterruptedException
190      * @throws SiteMessageException occurs when a site message need to be
191      *             displayed
192      */
193     protected Document getDocument( Page page, String strUrl )
194         throws IOException, InterruptedException, SiteMessageException
195     {
196         FieldType ft = new FieldType( StringField.TYPE_STORED );
197         ft.setOmitNorms( false );
198 
199         FieldType ftNotStored = new FieldType( StringField.TYPE_NOT_STORED );
200         ftNotStored.setOmitNorms( false );
201         ftNotStored.setTokenized( false );
202 
203         // make a new, empty document
204         Document doc = new Document(  );
205 
206         // Add the url as a field named "url".  Use an UnIndexed field, so
207         // that the url is just stored with the document, but is not searchable.
208         doc.add( new Field( SearchItem.FIELD_URL, strUrl, ft ) );
209 
210         // Add the last modified date of the file a field named "modified".
211         // Use a field that is indexed (i.e. searchable), but don't tokenize
212         // the field into words.
213         String strDate = DateTools.dateToString( page.getDateUpdate(  ), DateTools.Resolution.DAY );
214         doc.add( new Field( SearchItem.FIELD_DATE, strDate, ft ) );
215 
216         // Add the uid as a field, so that index can be incrementally maintained.
217         // This field is not stored with document, it is indexed, but it is not
218         // tokenized prior to indexing.
219         String strIdPage = String.valueOf( page.getId(  ) );
220         doc.add( new Field( SearchItem.FIELD_UID, strIdPage, ftNotStored ) );
221 
222         String strPageContent = _pageService.getPageContent( page.getId(  ), 0, null );
223         ContentHandler handler = new BodyContentHandler(  );
224         Metadata metadata = new Metadata(  );
225 
226         try
227         {
228             new HtmlParser(  ).parse( new ByteArrayInputStream( strPageContent.getBytes(  ) ), handler, metadata,
229                 new ParseContext(  ) );
230         }
231         catch ( SAXException e )
232         {
233             throw new AppException( "Error during page parsing." );
234         }
235         catch ( TikaException e )
236         {
237             throw new AppException( "Error during page parsing." );
238         }
239 
240         //the content of the article is recovered in the parser because this one
241         //had replaced the encoded caracters (as &eacute;) by the corresponding special caracter (as ?)
242         StringBuilder sb = new StringBuilder( handler.toString(  ) );
243 
244         // Add the tag-stripped contents as a Reader-valued Text field so it will
245         // get tokenized and indexed.
246         StringBuilder sbFieldContent = new StringBuilder(  );
247         StringBuilder sbFieldMetadata = new StringBuilder(  );
248         sbFieldContent.append( page.getName(  ) ).append( " " ).append( sb.toString(  ) );
249 
250         // Add the metadata description of the page if it exists
251         if ( page.getDescription(  ) != null )
252         {
253             sbFieldContent.append( " " ).append( page.getDescription(  ) );
254         }
255 
256         // Add the metadata keywords of the page if it exists
257         String strMetaKeywords = page.getMetaKeywords(  );
258 
259         if ( StringUtils.isNotBlank( strMetaKeywords ) )
260         {
261             sbFieldContent.append( " " ).append( strMetaKeywords );
262             sbFieldMetadata.append( strMetaKeywords );
263         }
264 
265         doc.add( new Field( SearchItem.FIELD_CONTENTS, sbFieldContent.toString(  ), TextField.TYPE_NOT_STORED ) );
266 
267         if ( StringUtils.isNotBlank( page.getMetaDescription(  ) ) )
268         {
269             if ( sbFieldMetadata.length(  ) > 0 )
270             {
271                 sbFieldMetadata.append( " " );
272             }
273 
274             sbFieldMetadata.append( page.getMetaDescription(  ) );
275         }
276 
277         if ( sbFieldMetadata.length(  ) > 0 )
278         {
279             doc.add( new StringField( SearchItem.FIELD_METADATA, sbFieldMetadata.toString(  ), Field.Store.NO ) );
280         }
281 
282         // Add the title as a separate Text field, so that it can be searched
283         // separately.
284         doc.add( new Field( SearchItem.FIELD_TITLE, page.getName(  ), ft ) );
285 
286         if ( StringUtils.isNotBlank( page.getDescription(  ) ) )
287         {
288             // Add the summary as an UnIndexed field, so that it is stored and returned
289             // with hit documents for display.
290             doc.add( new StoredField( SearchItem.FIELD_SUMMARY, page.getDescription(  ) ) );
291         }
292 
293         doc.add( new Field( SearchItem.FIELD_TYPE, INDEX_TYPE_PAGE, ft ) );
294         doc.add( new Field( SearchItem.FIELD_ROLE, page.getRole(  ), ft ) );
295 
296         // return the document
297         return doc;
298     }
299 
300     /**
301      * {@inheritDoc}
302      */
303     @Override
304     public List<String> getListType(  )
305     {
306         List<String> listType = new ArrayList<String>(  );
307         listType.add( INDEX_TYPE_PAGE );
308 
309         return listType;
310     }
311 
312     /**
313      * {@inheritDoc}
314      */
315     @Override
316     public String getSpecificSearchAppUrl(  )
317     {
318         return AppPropertiesService.getProperty( PROPERTY_SEARCH_PAGE_URL );
319     }
320 }