1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.plugins.document.service.docsearch;
35
36 import fr.paris.lutece.plugins.document.business.Document;
37 import fr.paris.lutece.plugins.document.business.DocumentHome;
38 import fr.paris.lutece.plugins.document.business.attributes.DocumentAttribute;
39 import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexer;
40 import fr.paris.lutece.plugins.lucene.service.indexer.IFileIndexerFactory;
41 import fr.paris.lutece.portal.service.search.SearchItem;
42 import fr.paris.lutece.portal.service.spring.SpringContextService;
43 import fr.paris.lutece.portal.service.util.AppException;
44 import fr.paris.lutece.portal.service.util.AppLogService;
45 import fr.paris.lutece.portal.service.util.AppPropertiesService;
46
47 import org.apache.lucene.document.Field;
48 import org.apache.lucene.document.FieldType;
49 import org.apache.lucene.document.StringField;
50 import org.apache.lucene.document.TextField;
51
52 import org.apache.tika.exception.TikaException;
53 import org.apache.tika.metadata.Metadata;
54 import org.apache.tika.parser.ParseContext;
55 import org.apache.tika.parser.html.HtmlParser;
56 import org.apache.tika.sax.BodyContentHandler;
57
58 import org.xml.sax.ContentHandler;
59 import org.xml.sax.SAXException;
60
61 import java.io.ByteArrayInputStream;
62 import java.io.IOException;
63
64 import java.text.DateFormat;
65
66 import java.util.ArrayList;
67 import java.util.Collection;
68 import java.util.List;
69
70
71
72
73
74 public class DefaultDocSearchIndexer implements IDocSearchIndexer
75 {
76
77 private static final String PROPERTY_WRITER_MAX_FIELD_LENGTH = "search.lucene.writer.maxFieldLength";
78 private static final int DEFAULT_WRITER_MAX_FIELD_LENGTH = 1000000;
79
80
81
82
83
84
85
86 public List<org.apache.lucene.document.Document> getDocuments( Collection<Integer> listDocumentIds )
87 throws IOException
88 {
89 List<org.apache.lucene.document.Document> listLuceneDocs = new ArrayList<org.apache.lucene.document.Document>( );
90
91 for ( Integer documentId : listDocumentIds )
92 {
93 Document document = DocumentHome.findByPrimaryKey( documentId );
94
95 if ( document != null )
96 {
97 listLuceneDocs.add( getDocument( document ) );
98 }
99 }
100
101 return listLuceneDocs;
102 }
103
104
105
106
107
108
109
110 private org.apache.lucene.document.Document getDocument( Document document )
111 throws IOException
112 {
113
114 org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document( );
115
116
117
118
119 FieldType ft = new FieldType( StringField.TYPE_STORED );
120 ft.setOmitNorms( false );
121
122 DateFormat formater = DateFormat.getDateInstance( DateFormat.SHORT );
123 String strDate = formater.format( document.getDateModification( ) );
124 doc.add( new Field( SearchItem.FIELD_DATE, strDate, ft ) );
125
126
127
128
129 String strIdDocument = String.valueOf( document.getId( ) );
130 doc.add( new Field( SearchItem.FIELD_UID, strIdDocument, ft ) );
131
132 String strContentToIndex = getContentToIndex( document );
133 int nWriteLimit = AppPropertiesService.getPropertyInt( PROPERTY_WRITER_MAX_FIELD_LENGTH, DEFAULT_WRITER_MAX_FIELD_LENGTH );
134 ContentHandler handler = new BodyContentHandler( nWriteLimit );
135 Metadata metadata = new Metadata( );
136
137 try
138 {
139 new HtmlParser( ).parse( new ByteArrayInputStream( strContentToIndex.getBytes( ) ), handler, metadata,
140 new ParseContext( ) );
141 }
142 catch ( SAXException e )
143 {
144 throw new AppException( "Error during document parsing.", e );
145 }
146 catch ( TikaException e )
147 {
148 throw new AppException( "Error during document parsing.", e );
149 }
150
151
152
153 StringBuilder sb = new StringBuilder( handler.toString( ) );
154
155
156
157 doc.add( new Field( SearchItem.FIELD_CONTENTS, sb.toString( ), TextField.TYPE_NOT_STORED ) );
158
159
160
161 FieldType ft2 = new FieldType( TextField.TYPE_STORED );
162 ft2.setOmitNorms( true );
163 doc.add( new Field( SearchItem.FIELD_TITLE, document.getTitle( ), ft2 ) );
164 doc.add( new Field( DocSearchItem.FIELD_SUMMARY, document.getSummary( ), ft2 ) );
165
166 doc.add( new Field( SearchItem.FIELD_TYPE, document.getType( ), ft ) );
167 doc.add( new Field( DocSearchItem.FIELD_SPACE, "s" + document.getSpaceId( ), ft2 ) );
168
169
170 return doc;
171 }
172
173
174
175
176
177
178 private static String getContentToIndex( Document document )
179 {
180 StringBuilder sbContentToIndex = new StringBuilder( );
181 sbContentToIndex.append( document.getTitle( ) );
182 sbContentToIndex.append( " " );
183 sbContentToIndex.append( document.getSummary( ) );
184 sbContentToIndex.append( " " );
185
186 for ( DocumentAttribute attribute : document.getAttributes( ) )
187 {
188 if ( attribute.isSearchable( ) )
189 {
190 if ( !attribute.isBinary( ) )
191 {
192 sbContentToIndex.append( attribute.getTextValue( ) );
193 sbContentToIndex.append( " " );
194 }
195 else
196 {
197 IFileIndexerFactory factoryIndexer = (IFileIndexerFactory) SpringContextService.getBean( IFileIndexerFactory.BEAN_FILE_INDEXER_FACTORY );
198 IFileIndexer indexer = factoryIndexer.getIndexer( attribute.getValueContentType( ) );
199
200 if ( indexer != null )
201 {
202 try
203 {
204 ByteArrayInputStream bais = new ByteArrayInputStream( attribute.getBinaryValue( ) );
205 sbContentToIndex.append( indexer.getContentToIndex( bais ) );
206 sbContentToIndex.append( " " );
207 bais.close( );
208 }
209 catch ( IOException e )
210 {
211 AppLogService.error( e.getMessage( ), e );
212 }
213 }
214 }
215 }
216 }
217
218
219 sbContentToIndex.append( document.getXmlMetadata( ) );
220
221 return sbContentToIndex.toString( );
222 }
223 }