View Javadoc
1   /*
2    * Copyright (c) 2002-2020, Mairie de Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.nlptools;
35  
36  import java.io.IOException;
37  import java.io.InputStream;
38  import java.text.MessageFormat;
39  import java.util.ArrayList;
40  import java.util.List;
41  import opennlp.tools.namefind.NameFinderME;
42  import opennlp.tools.namefind.TokenNameFinderModel;
43  import opennlp.tools.tokenize.TokenizerME;
44  import opennlp.tools.tokenize.TokenizerModel;
45  import opennlp.tools.util.Span;
46  
47  /**
48   * PersonNameFinder
49   */
50  public class PersonNameFinder extends AbstractFinder
51  {
52  
53      private static final String TOKEN_DEFAULT_MODEL = "/fr/paris/lutece/nlptools/models/{0}-token.bin";
54      private static final String NAME_FINDER_DEFAULT_MODEL = "/fr/paris/lutece/nlptools/models/{0}-ner-person.bin";
55      private static final int MODULO = 1000;
56  
57      private static String[] _ignoredNames =
58      {
59          "Les", "Merci", "R", "Il", "Elle", "Dans", "Pour", "De", "Bien", "Monsieur", "Votre", "Cordialement", "Et", "Ils", "Nous", "Vous", "Sur", "Mes", "Je", "Ainsi", "Ville", "J'ai" , "En", "Alors"
60      };
61  
62      private static String _strTokenModel = TOKEN_DEFAULT_MODEL;
63      private static TokenizerME _tokenizer;
64  
65      private String _strNameFinderModel = NAME_FINDER_DEFAULT_MODEL;
66      private NameFinderME _nameFinder;
67  
68      private boolean _bInit;
69  
70      /**
71       * Constructor
72       */
73      public PersonNameFinder()
74      {
75          super();
76      }
77  
78      /**
79       * Constructor
80       *
81       * @param strReplacement The replacement
82       */
83      public PersonNameFinder(String strReplacement)
84      {
85          super(strReplacement);
86      }
87  
88      /**
89       * Constructor
90       *
91       * @param strReplacement The replacement
92       */
93      public PersonNameFinder(String strReplacement, String strLanguage)
94      {
95          super(strReplacement, strLanguage);
96      }
97  
98      /**
99       * @return the Model
100      */
101     public String getNameModel()
102     {
103         return _strNameFinderModel;
104     }
105 
106     /**
107      * @param strModel the Model to set
108      */
109     public void setNameModel(String strModel)
110     {
111         _strNameFinderModel = strModel;
112     }
113 
114     /**
115      * @return the Model
116      */
117     public static String getTokenModel()
118     {
119         return _strTokenModel;
120     }
121 
122     /**
123      * @param strModel the Model to set
124      */
125     public static void setTokenModel(String strModel)
126     {
127         _strTokenModel = strModel;
128     }
129 
130     /**
131      * {@inheritDoc }
132      */
133     @Override
134     public List<String> findOccurrences(String strInput) throws FinderException
135     {
136         if (!_bInit)
137         {
138             init();
139         }
140         else
141         {
142             _nameFinder.clearAdaptiveData();
143         }
144 
145         String strCleanedInput = clean(strInput);
146 
147         List<String> listNames = new ArrayList<>();
148         String[] sentences = strCleanedInput.split("\"");
149         int nTotal = sentences.length;
150         System.out.println("NameFinder - number of text bloc to process: " + nTotal );
151         int nCount = 0;
152         int nPercent = 0;
153         for (String strSentece : sentences)
154         {
155             String[] tokens = _tokenizer.tokenize(strSentece);
156             Span nameSpans[] = _nameFinder.find(tokens);
157             for (Span span : nameSpans)
158             {
159                 StringBuilder sbName = new StringBuilder();
160                 for (int i = span.getStart(); i < span.getEnd(); i++)
161                 {
162                     if (i > span.getStart())
163                     {
164                         sbName.append(" ");
165                     }
166                     sbName.append(tokens[i]);
167                 }
168                 String strEntity = sbName.toString();
169                 if (isValidPersonName(strEntity))
170                 {
171                     listNames.add(strEntity);
172                     addEntity(strEntity);
173                 }
174             }
175             nCount++;
176             int nNewPercent = 100 * nCount / nTotal;
177             if ( nNewPercent != nPercent )
178             {
179                 nPercent = nNewPercent;
180                 System.out.println("NameFinder - number of text bloc processed: " + nCount + " - (" + nPercent + "%)" );
181             }
182         }
183         return listNames;
184 
185     }
186 
187     /**
188      * {@inheritDoc }
189      */
190     @Override
191     public String replaceOccurrences(String strInputText) throws FinderException
192     {
193         return replaceOccurrences(strInputText, getReplacement());
194     }
195 
196     /**
197      * {@inheritDoc }
198      */
199     @Override
200     public String replaceOccurrences(String strInput, String strReplacement) throws FinderException
201     {
202         if (!_bInit)
203         {
204             init();
205         }
206         else
207         {
208             _nameFinder.clearAdaptiveData();
209         }
210 
211         String strCleanedInput = clean(strInput);
212         String[] sentences = strCleanedInput.split("\"");
213         int nTotal = sentences.length;
214         System.out.println("NameFinder - number of text bloc to process: " + nTotal );
215         StringBuilder sbOutput = new StringBuilder();
216         int nCount = 0;
217         int nPercent = 0;
218         for (String strSentece : sentences)
219         {
220             sbOutput.append('"');
221             String[] tokens = _tokenizer.tokenize(strSentece);
222             String[] output = new String[tokens.length];
223             Span nameSpans[] = _nameFinder.find(tokens);
224             int i = 0;
225             int j = 0;
226             while (true)
227             {
228 
229                 for (Span span : nameSpans)
230                 {
231                     if (i == span.getStart())
232                     {
233                         i = span.getEnd();
234 
235                         StringBuilder sbName = new StringBuilder();
236                         for (int k = span.getStart(); k < span.getEnd(); k++)
237                         {
238                             if (k > span.getStart())
239                             {
240                                 sbName.append(" ");
241                             }
242                             sbName.append(tokens[k]);
243                         }
244                         String strEntity = sbName.toString();
245                         if (isValidPersonName(strEntity))
246                         {
247                             output[j++] = strReplacement;
248                         }
249                         else
250                         {
251                             output[j++] = strEntity;
252                         }
253                     }
254                 }
255 
256                 if (i < tokens.length && j < output.length)
257                 {
258                     output[j++] = tokens[i++];
259                 }
260                 else
261                 {
262                     break;
263                 }
264             }
265 
266             for (String strWord : output)
267             {
268                 if (strWord != null)
269                 {
270                     sbOutput.append(strWord).append(" ");
271                 }
272             }
273             nCount++;
274             int nNewPercent = 100 * nCount / nTotal;
275             if ( nNewPercent != nPercent )
276             {
277                 nPercent = nNewPercent;
278                 System.out.println("NameFinder - number of text bloc processed: " + nCount + " - (" + nPercent + "%)" );
279             }
280         }
281         sbOutput.append('"');
282 
283         System.out.println("NameFinder - number of text bloc processed: " + nCount);
284         
285         String strOutput = sbOutput.toString();
286 
287         strOutput = strOutput.replaceAll( "# # # # # # # # " , "\n" );
288         strOutput = strOutput.replaceAll( "# # # # # # # #" , "" );
289         strOutput = strOutput.replaceAll( "# # # # # # ##" , "" );
290         strOutput = strOutput.replaceAll( "## # # # # ##" , "" );
291         strOutput = strOutput.replaceAll( "## # # # # # #" , "" );
292         strOutput = strOutput.replaceAll( "#\n# # # # # ##" , "" );
293 //        strOutput = strOutput.replaceAll( " # # # # ##" , "" );
294         
295         return strOutput;
296     }
297 
298     /**
299      * Initialize the finder by loading models
300      *
301      * @throws FinderException
302      */
303     private void init() throws FinderException
304     {
305         String strTokenModel = MessageFormat.format(_strTokenModel, getLanguage());
306         String strNameFinderModel = MessageFormat.format(_strNameFinderModel, getLanguage());
307         try (InputStream isTokenModel = PersonNameFinder.class.getResourceAsStream(strTokenModel);
308                 InputStream isNameFinderModel = PersonNameFinder.class.getResourceAsStream(strNameFinderModel))
309         {
310             TokenizerModel tm = new TokenizerModel(isTokenModel);
311             _tokenizer = new TokenizerME(tm);
312             TokenNameFinderModel model = new TokenNameFinderModel(isNameFinderModel);
313             _nameFinder = new NameFinderME(model);
314             _bInit = true;
315         }
316         catch (IOException ex)
317         {
318             throw new FinderException("Error loading model : " + ex.getMessage(), ex);
319         }
320 
321     }
322 
323     private String clean(String strInput)
324     {
325         String strClean = strInput.replaceAll("_x000D_", "");
326         strClean = strClean.replaceAll( "\n", "########" );
327         
328         return strClean;
329     }
330 
331     private boolean isValidPersonName(String strInput)
332     {
333         for (String strIgnore : _ignoredNames)
334         {
335             if (strInput.equals(strIgnore))
336             {
337                 return false;
338             }
339         }
340         return true;
341 
342     }
343 }