View Javadoc
1   /*
2    * Copyright (c) 2002-2023, City of Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.plugins.document.service.docsearch;
35  
36  import fr.paris.lutece.plugins.document.business.Document;
37  import fr.paris.lutece.plugins.document.business.DocumentHome;
38  import fr.paris.lutece.plugins.document.business.attributes.DocumentAttribute;
39  import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexer;
40  import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexerFactory;
41  import fr.paris.lutece.portal.service.search.SearchItem;
42  import fr.paris.lutece.portal.service.spring.SpringContextService;
43  import fr.paris.lutece.portal.service.util.AppException;
44  import fr.paris.lutece.portal.service.util.AppLogService;
45  import fr.paris.lutece.portal.service.util.AppPropertiesService;
46  
47  import org.apache.lucene.document.Field;
48  import org.apache.lucene.document.FieldType;
49  import org.apache.lucene.document.StringField;
50  import org.apache.lucene.document.TextField;
51  
52  import org.apache.tika.exception.TikaException;
53  import org.apache.tika.metadata.Metadata;
54  import org.apache.tika.parser.ParseContext;
55  import org.apache.tika.parser.html.HtmlParser;
56  import org.apache.tika.sax.BodyContentHandler;
57  
58  import org.xml.sax.ContentHandler;
59  import org.xml.sax.SAXException;
60  
61  import java.io.ByteArrayInputStream;
62  import java.io.IOException;
63  
64  import java.text.DateFormat;
65  
66  import java.util.ArrayList;
67  import java.util.Collection;
68  import java.util.List;
69  
70  
71  /**
72   * DefaultDocSearchIndexer
73   */
74  public class DefaultDocSearchIndexer implements IDocSearchIndexer
75  {
76  
77      private static final String PROPERTY_WRITER_MAX_FIELD_LENGTH = "search.lucene.writer.maxFieldLength"; // from the core
78      private static final int DEFAULT_WRITER_MAX_FIELD_LENGTH = 1000000;
79  
80      /**
81       * Build Lucene docs to index
82       * @param listDocumentIds Documents to index
83       * @return A list of Lucene documents
84       * @throws IOException i/o exception
85       */
86      public List<org.apache.lucene.document.Document> getDocuments( Collection<Integer> listDocumentIds )
87          throws IOException
88      {
89          List<org.apache.lucene.document.Document> listLuceneDocs = new ArrayList<org.apache.lucene.document.Document>(  );
90  
91          for ( Integer documentId : listDocumentIds )
92          {
93              Document document = DocumentHome.findByPrimaryKey( documentId );
94  
95              if ( document != null )
96              {
97                  listLuceneDocs.add( getDocument( document ) );
98              }
99          }
100 
101         return listLuceneDocs;
102     }
103 
104     /**
105      * Return the document
106      * @param document Documents object
107      * @return document
108      * @throws IOException i/o exception
109      */
110     private org.apache.lucene.document.Document getDocument( Document document )
111         throws IOException
112     {
113         // make a new, empty Lucene document
114         org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(  );
115 
116         // Add the last modified date of the file a field named "modified".
117         // Use a field that is indexed (i.e. searchable), but don't tokenize
118         // the field into words.
119         FieldType ft = new FieldType( StringField.TYPE_STORED );
120         ft.setOmitNorms( false );
121 
122         DateFormat formater = DateFormat.getDateInstance( DateFormat.SHORT );
123         String strDate = formater.format( document.getDateModification(  ) );
124         doc.add( new Field( SearchItem.FIELD_DATE, strDate, ft ) );
125 
126         // Add the uid as a field, so that index can be incrementally maintained.
127         // This field is stored with document, it is indexed, but it is not
128         // tokenized prior to indexing.
129         String strIdDocument = String.valueOf( document.getId(  ) );
130         doc.add( new Field( SearchItem.FIELD_UID, strIdDocument, ft ) );
131 
132         String strContentToIndex = getContentToIndex( document );
133         int nWriteLimit = AppPropertiesService.getPropertyInt( PROPERTY_WRITER_MAX_FIELD_LENGTH, DEFAULT_WRITER_MAX_FIELD_LENGTH );
134         ContentHandler handler = new BodyContentHandler( nWriteLimit );
135         Metadata metadata = new Metadata(  );
136 
137         try
138         {
139             new HtmlParser(  ).parse( new ByteArrayInputStream( strContentToIndex.getBytes(  ) ), handler, metadata,
140                 new ParseContext(  ) );
141         }
142         catch ( SAXException e )
143         {
144             throw new AppException( "Error during document parsing.", e );
145         }
146         catch ( TikaException e )
147         {
148             throw new AppException( "Error during document parsing.", e );
149         }
150 
151         //the content of the article is recovered in the parser because this one
152         //had replaced the encoded caracters (as &eacute;) by the corresponding special caracter (as ?)
153         StringBuilder sb = new StringBuilder( handler.toString(  ) );
154 
155         // Add the tag-stripped contents as a Reader-valued Text field so it will
156         // get tokenized and indexed.
157         doc.add( new Field( SearchItem.FIELD_CONTENTS, sb.toString(  ), TextField.TYPE_NOT_STORED ) );
158 
159         // Add the title as a separate Text field, so that it can be searched
160         // separately.
161         FieldType ft2 = new FieldType( TextField.TYPE_STORED );
162         ft2.setOmitNorms( true );
163         doc.add( new Field( SearchItem.FIELD_TITLE, document.getTitle(  ), ft2 ) );
164         doc.add( new Field( DocSearchItem.FIELD_SUMMARY, document.getSummary(  ), ft2 ) );
165 
166         doc.add( new Field( SearchItem.FIELD_TYPE, document.getType(  ), ft ) );
167         doc.add( new Field( DocSearchItem.FIELD_SPACE, "s" + document.getSpaceId(  ), ft2 ) );
168 
169         // return the document
170         return doc;
171     }
172 
173     /**
174      * Return the content
175      * @param document Document object
176      * @return content
177      */
178     private static String getContentToIndex( Document document )
179     {
180         StringBuilder sbContentToIndex = new StringBuilder(  );
181         sbContentToIndex.append( document.getTitle(  ) );
182         sbContentToIndex.append( " " );
183         sbContentToIndex.append( document.getSummary(  ) );
184         sbContentToIndex.append( " " );
185 
186         for ( DocumentAttribute attribute : document.getAttributes(  ) )
187         {
188             if ( attribute.isSearchable(  ) )
189             {
190                 if ( !attribute.isBinary(  ) )
191                 {
192                     sbContentToIndex.append( attribute.getTextValue(  ) );
193                     sbContentToIndex.append( " " );
194                 }
195                 else
196                 {
197                     IFileIndexerFactory factoryIndexer = (IFileIndexerFactory) SpringContextService.getBean( IFileIndexerFactory.BEAN_FILE_INDEXER_FACTORY );
198                     IFileIndexer indexer = factoryIndexer.getIndexer( attribute.getValueContentType(  ) );
199 
200                     if ( indexer != null )
201                     {
202                         try
203                         {
204                             ByteArrayInputStream bais = new ByteArrayInputStream( attribute.getBinaryValue(  ) );
205                             sbContentToIndex.append( indexer.getContentToIndex( bais ) );
206                             sbContentToIndex.append( " " );
207                             bais.close(  );
208                         }
209                         catch ( IOException e )
210                         {
211                             AppLogService.error( e.getMessage(  ), e );
212                         }
213                     }
214                 }
215             }
216         }
217 
218         // Add metadata in XML (xml tags will be ignored by the HTML parsing)
219         sbContentToIndex.append( document.getXmlMetadata(  ) );
220 
221         return sbContentToIndex.toString(  );
222     }
223 }