1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.plugins.analyzer.service;
35
36 import org.apache.lucene.analysis.ASCIIFoldingFilter;
37 import org.apache.lucene.analysis.Analyzer;
38 import org.apache.lucene.analysis.LowerCaseFilter;
39 import org.apache.lucene.analysis.StopFilter;
40 import org.apache.lucene.analysis.TokenStream;
41 import org.apache.lucene.analysis.Tokenizer;
42 import org.apache.lucene.analysis.WordlistLoader;
43 import org.apache.lucene.analysis.fr.ElisionFilter;
44 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
45 import org.apache.lucene.analysis.fr.FrenchStemFilter;
46 import org.apache.lucene.analysis.standard.StandardFilter;
47 import org.apache.lucene.analysis.standard.StandardTokenizer;
48 import org.apache.lucene.util.Version;
49
50 import java.io.File;
51 import java.io.IOException;
52 import java.io.Reader;
53
54 import java.util.HashSet;
55 import java.util.Map;
56 import java.util.Set;
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78 public class LuteceFrenchAnalyzer extends Analyzer
79 {
80
81
82
83 public static final String[] FRENCH_STOP_WORDS =
84 {
85 "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi", "autre", "autres",
86 "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir", "c", "car", "ce", "ceci", "cela",
87 "celle", "celles", "celui", "cependant", "certain", "certaine", "certaines", "certains", "ces", "cet",
88 "cette", "ceux", "chez", "ci", "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de",
89 "debout", "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles", "desquels",
90 "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse", "diverses", "doit", "donc", "dont",
91 "du", "duquel", "durant", "dès", "elle", "elles", "en", "entre", "environ", "est", "et", "etc", "etre",
92 "eu", "eux", "excepté", "hormis", "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l",
93 "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
94 "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi", "moins", "mon",
95 "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre", "nous", "néanmoins", "nôtre",
96 "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi", "partant", "pas", "passé", "pendant", "plein",
97 "plus", "plusieurs", "pour", "pourquoi", "proche", "près", "puisque", "qu", "quand", "que", "quel",
98 "quelle", "quelles", "quels", "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf",
99 "se", "selon", "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit", "son",
100 "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes", "tiens", "toi", "ton",
101 "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers", "voici", "voilà", "vos", "votre",
102 "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès", "été", "être", "ô",
103 };
104
105
106
107
108 @SuppressWarnings( "unchecked" )
109 private Set _stoptable = new HashSet( );
110
111
112
113
114 @SuppressWarnings( "unchecked" )
115 private Set _excltable = new HashSet( );
116 private final Version _matchVersion;
117
118
119
120
121
122
123 public LuteceFrenchAnalyzer( )
124 {
125 this( Version.LUCENE_23 );
126 }
127
128
129
130
131
132 public LuteceFrenchAnalyzer( Version matchVersion )
133 {
134 _stoptable = StopFilter.makeStopSet( FRENCH_STOP_WORDS );
135 this._matchVersion = matchVersion;
136 }
137
138
139
140
141
142
143
144 public LuteceFrenchAnalyzer( String[] stopwords )
145 {
146 this( Version.LUCENE_23, stopwords );
147 }
148
149
150
151
152
153
154 public LuteceFrenchAnalyzer( Version matchVersion, String[] stopwords )
155 {
156 _stoptable = StopFilter.makeStopSet( stopwords );
157 this._matchVersion = matchVersion;
158 }
159
160
161
162
163
164
165
166
167 public LuteceFrenchAnalyzer( File stopwords ) throws IOException
168 {
169 this( Version.LUCENE_23, stopwords );
170 }
171
172
173
174
175
176
177
178 @SuppressWarnings( "unchecked" )
179 public LuteceFrenchAnalyzer( Version matchVersion, File stopwords )
180 throws IOException
181 {
182 _stoptable = new HashSet( WordlistLoader.getWordSet( stopwords ) );
183 this._matchVersion = matchVersion;
184 }
185
186
187
188
189
190 public void setStemExclusionTable( String[] exclusionlist )
191 {
192 _excltable = StopFilter.makeStopSet( exclusionlist );
193 setPreviousTokenStream( null );
194 }
195
196
197
198
199
200 @SuppressWarnings( "unchecked" )
201 public void setStemExclusionTable( Map exclusionlist )
202 {
203 _excltable = new HashSet( exclusionlist.keySet( ) );
204 setPreviousTokenStream( null );
205 }
206
207
208
209
210
211
212 @SuppressWarnings( "unchecked" )
213 public void setStemExclusionTable( File exclusionlist )
214 throws IOException
215 {
216 _excltable = new HashSet( WordlistLoader.getWordSet( exclusionlist ) );
217 setPreviousTokenStream( null );
218 }
219
220
221
222
223
224 public final TokenStream tokenStream( String fieldName, Reader reader )
225 {
226 if ( fieldName == null )
227 {
228 throw new IllegalArgumentException( "fieldName must not be null" );
229 }
230
231 if ( reader == null )
232 {
233 throw new IllegalArgumentException( "reader must not be null" );
234 }
235
236 TokenStream result = new StandardTokenizer( _matchVersion, reader );
237
238 result = new ElisionFilter( result, _stoptable );
239
240 result = new StandardFilter( result );
241 result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault( _matchVersion ), result, _stoptable );
242 result = new ASCIIFoldingFilter( result );
243 result = new FrenchStemFilter( result, _excltable );
244
245 result = new LowerCaseFilter( result );
246
247 return result;
248 }
249
250
251
252
253
254 public TokenStream reusableTokenStream( String fieldName, Reader reader )
255 throws IOException
256 {
257 SavedStreams streams = (SavedStreams) getPreviousTokenStream( );
258
259 if ( streams == null )
260 {
261 streams = new SavedStreams( );
262 streams._source = new StandardTokenizer( _matchVersion, reader );
263 streams._result = new StandardFilter( streams._source );
264
265 streams._result = new ElisionFilter( streams._result, _stoptable );
266
267 streams._result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault( _matchVersion ),
268 streams._result, _stoptable );
269
270 streams._result = new ASCIIFoldingFilter( streams._result );
271 streams._result = new FrenchStemFilter( streams._result, _excltable );
272
273
274 streams._result = new LowerCaseFilter( streams._result );
275 setPreviousTokenStream( streams );
276 }
277 else
278 {
279 streams._source.reset( reader );
280 }
281
282 return streams._result;
283 }
284
285
286
287
288
289
290 private class SavedStreams
291 {
292 Tokenizer _source;
293 TokenStream _result;
294 }
295 }