/*
 * Copyright (c) 2002-2014, Mairie de Paris
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice
 *     and the following disclaimer.
 *
 *  2. Redistributions in binary form must reproduce the above copyright notice
 *     and the following disclaimer in the documentation and/or other materials
 *     provided with the distribution.
 *
 *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
 *     contributors may be used to endorse or promote products derived from
 *     this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * License 1.0
 */
package fr.paris.lutece.plugins.analyzer.service;

import org.apache.lucene.analysis.ASCIIFoldingFilter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.fr.FrenchAnalyzer; // javadocs only
import org.apache.lucene.analysis.fr.FrenchStemFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;
import java.io.Reader;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * A French analyzer similar to {@link FrenchAnalyzer}, with an additional
 * {@link ElisionFilter} and {@link ASCIIFoldingFilter}. The token stream is
 * built from the following filter chain:
 * <ol>
 *  <li>{@link StandardTokenizer}</li>
 *  <li>{@link ElisionFilter}</li>
 *  <li>{@link StandardFilter}</li>
 *  <li>{@link StopFilter}</li>
 *  <li>{@link ASCIIFoldingFilter}</li>
 *  <li>{@link FrenchStemFilter}</li>
 *  <li>{@link LowerCaseFilter}</li>
 * </ol>
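 * <p>
 * A minimal usage sketch (the field name and sample text are illustrative,
 * and the {@link Version} constant and token attribute API assume a
 * Lucene 2.9/3.0 release):
 * <pre>
 * Analyzer analyzer = new LuteceFrenchAnalyzer( Version.LUCENE_29 );
 * TokenStream stream = analyzer.tokenStream( "contents", new StringReader( "L'école élémentaire" ) );
 * TermAttribute term = stream.addAttribute( TermAttribute.class );
 *
 * while ( stream.incrementToken(  ) )
 * {
 *     System.out.println( term.term(  ) ); // accent-free, stemmed terms
 * }
 * </pre>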
 *
 * @see FrenchAnalyzer
 * @see ElisionFilter
 * @see ASCIIFoldingFilter
 * @see Analyzer
 */
public class LuteceFrenchAnalyzer extends Analyzer
{
    /**
     * Extended list of typical French stopwords.
     */
    public static final String[] FRENCH_STOP_WORDS =
        {
            "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi", "autre", "autres",
            "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir", "c", "car", "ce", "ceci", "cela",
            "celle", "celles", "celui", "cependant", "certain", "certaine", "certaines", "certains", "ces", "cet",
            "cette", "ceux", "chez", "ci", "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de",
            "debout", "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles", "desquels",
            "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse", "diverses", "doit", "donc", "dont",
            "du", "duquel", "durant", "dès", "elle", "elles", "en", "entre", "environ", "est", "et", "etc", "etre",
            "eu", "eux", "excepté", "hormis", "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l",
            "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
            "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi", "moins", "mon",
            "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre", "nous", "néanmoins", "nôtre",
            "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi", "partant", "pas", "passé", "pendant", "plein",
            "plus", "plusieurs", "pour", "pourquoi", "proche", "près", "puisque", "qu", "quand", "que", "quel",
            "quelle", "quelles", "quels", "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf",
            "se", "selon", "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit", "son",
            "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes", "tiens", "toi", "ton",
            "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers", "voici", "voilà", "vos", "votre",
            "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès", "été", "être", "ô",
        };

    /**
     * Contains the stopwords used with the {@link StopFilter}.
     */
    @SuppressWarnings( "unchecked" )
    private Set _stoptable = new HashSet(  );

    /**
     * Contains words that should be indexed but not stemmed.
     */
    @SuppressWarnings( "unchecked" )
    private Set _excltable = new HashSet(  );
    private final Version _matchVersion;

    /**
     * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
     *
     * @deprecated Use {@link #LuteceFrenchAnalyzer(Version)} instead.
     */
    @Deprecated
    public LuteceFrenchAnalyzer(  )
    {
        this( Version.LUCENE_23 );
    }

    /**
     * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
     * @param matchVersion the Lucene compatibility version
     */
    public LuteceFrenchAnalyzer( Version matchVersion )
    {
        _stoptable = StopFilter.makeStopSet( FRENCH_STOP_WORDS );
        this._matchVersion = matchVersion;
    }

    /**
     * Builds an analyzer with the given stop words.
     * @param stopwords the stop words
     * @deprecated Use {@link #LuteceFrenchAnalyzer(Version,
     * String[])} instead.
     */
    @Deprecated
    public LuteceFrenchAnalyzer( String[] stopwords )
    {
        this( Version.LUCENE_23, stopwords );
    }

    /**
     * Builds an analyzer with the given stop words.
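     * <p>
     * For example (the stop word list shown is a purely illustrative sample):
     * <pre>
     * String[] stopWords = { "le", "la", "les", "un", "une", "des" };
     * Analyzer analyzer = new LuteceFrenchAnalyzer( Version.LUCENE_29, stopWords );
     * </pre>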
     * @param matchVersion the Lucene compatibility version
     * @param stopwords the stop words
     */
    public LuteceFrenchAnalyzer( Version matchVersion, String[] stopwords )
    {
        _stoptable = StopFilter.makeStopSet( stopwords );
        this._matchVersion = matchVersion;
    }

    /**
     * Builds an analyzer with the stop words read from the given file.
     * @param stopwords the stop words file
     * @throws IOException if the stop words file cannot be read
     *
     * @deprecated Use {@link #LuteceFrenchAnalyzer(Version, File)} instead
     */
    @Deprecated
    public LuteceFrenchAnalyzer( File stopwords ) throws IOException
    {
        this( Version.LUCENE_23, stopwords );
    }

    /**
     * Builds an analyzer with the stop words read from the given file.
     * @param matchVersion the Lucene compatibility version
     * @param stopwords the stop words file
     * @throws IOException if the stop words file cannot be read
     */
    @SuppressWarnings( "unchecked" )
    public LuteceFrenchAnalyzer( Version matchVersion, File stopwords )
        throws IOException
    {
        _stoptable = new HashSet( WordlistLoader.getWordSet( stopwords ) );
        this._matchVersion = matchVersion;
    }

    /**
     * Builds an exclusion list from an array of Strings.
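     * <p>
     * For example, to keep some terms from being stemmed, assuming
     * <code>analyzer</code> is an instance of this class and the words shown
     * are illustrative:
     * <pre>
     * analyzer.setStemExclusionTable( new String[] { "Paris", "Lutece" } );
     * </pre>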
     * @param exclusionlist the non-stemmable words
     */
    public void setStemExclusionTable( String[] exclusionlist )
    {
        _excltable = StopFilter.makeStopSet( exclusionlist );
        setPreviousTokenStream( null ); // force a new stemmer to be created
    }

    /**
     * Builds an exclusion list from the keys of a Map.
     * @param exclusionlist a Map whose keys are the non-stemmable words
     */
    @SuppressWarnings( "unchecked" )
    public void setStemExclusionTable( Map exclusionlist )
    {
        _excltable = new HashSet( exclusionlist.keySet(  ) );
        setPreviousTokenStream( null ); // force a new stemmer to be created
    }

    /**
     * Builds an exclusion list from the words contained in the given file.
     * @param exclusionlist file containing the non-stemmable words
     * @throws IOException if the exclusion file cannot be read
     */
    @SuppressWarnings( "unchecked" )
    public void setStemExclusionTable( File exclusionlist )
        throws IOException
    {
        _excltable = new HashSet( WordlistLoader.getWordSet( exclusionlist ) );
        setPreviousTokenStream( null ); // force a new stemmer to be created
    }

    /**
     *
     * {@inheritDoc}
     */
    @Override
    public final TokenStream tokenStream( String fieldName, Reader reader )
    {
        if ( fieldName == null )
        {
            throw new IllegalArgumentException( "fieldName must not be null" );
        }

        if ( reader == null )
        {
            throw new IllegalArgumentException( "reader must not be null" );
        }

        TokenStream result = new StandardTokenizer( _matchVersion, reader );

        // Strip elided articles (l', d', qu', ...) that appear in the stop word set
        result = new ElisionFilter( result, _stoptable );

        result = new StandardFilter( result );
        result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault( _matchVersion ), result, _stoptable );
        // Fold accented characters to their ASCII equivalents (é -> e, ç -> c, ...)
        result = new ASCIIFoldingFilter( result );
        result = new FrenchStemFilter( result, _excltable );
        // Convert to lowercase after stemming!
        result = new LowerCaseFilter( result );

        return result;
    }

    /**
     *
     * {@inheritDoc}
     */
    @Override
    public TokenStream reusableTokenStream( String fieldName, Reader reader )
        throws IOException
    {
        SavedStreams streams = (SavedStreams) getPreviousTokenStream(  );

        if ( streams == null )
        {
            // Build the filter chain once and cache it for reuse
            streams = new SavedStreams(  );
            streams._source = new StandardTokenizer( _matchVersion, reader );
            streams._result = new StandardFilter( streams._source );

            streams._result = new ElisionFilter( streams._result, _stoptable );

            streams._result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault( _matchVersion ),
                    streams._result, _stoptable );

            streams._result = new ASCIIFoldingFilter( streams._result );
            streams._result = new FrenchStemFilter( streams._result, _excltable );

            // Convert to lowercase after stemming!
            streams._result = new LowerCaseFilter( streams._result );
            setPreviousTokenStream( streams );
        }
        else
        {
            // Reuse the cached chain on the new reader
            streams._source.reset( reader );
        }

        return streams._result;
    }

    /**
     * Private holder for the reusable tokenizer and token stream.
     * @see Analyzer#getPreviousTokenStream()
     */
    private class SavedStreams
    {
        Tokenizer _source;
        TokenStream _result;
    }
}