1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.nlptools;
35
36 import java.io.IOException;
37 import java.io.InputStream;
38 import java.text.MessageFormat;
39 import java.util.ArrayList;
40 import java.util.List;
41 import opennlp.tools.namefind.NameFinderME;
42 import opennlp.tools.namefind.TokenNameFinderModel;
43 import opennlp.tools.tokenize.TokenizerME;
44 import opennlp.tools.tokenize.TokenizerModel;
45 import opennlp.tools.util.Span;
46
47
48
49
50 public class PersonNameFinder extends AbstractFinder
51 {
52
53 private static final String TOKEN_DEFAULT_MODEL = "/fr/paris/lutece/nlptools/models/{0}-token.bin";
54 private static final String NAME_FINDER_DEFAULT_MODEL = "/fr/paris/lutece/nlptools/models/{0}-ner-person.bin";
55 private static final int MODULO = 1000;
56
57 private static String[] _ignoredNames =
58 {
59 "Les", "Merci", "R", "Il", "Elle", "Dans", "Pour", "De", "Bien", "Monsieur", "Votre", "Cordialement", "Et", "Ils", "Nous", "Vous", "Sur", "Mes", "Je", "Ainsi", "Ville", "J'ai" , "En", "Alors"
60 };
61
62 private static String _strTokenModel = TOKEN_DEFAULT_MODEL;
63 private static TokenizerME _tokenizer;
64
65 private String _strNameFinderModel = NAME_FINDER_DEFAULT_MODEL;
66 private NameFinderME _nameFinder;
67
68 private boolean _bInit;
69
70
71
72
73 public PersonNameFinder()
74 {
75 super();
76 }
77
78
79
80
81
82
83 public PersonNameFinder(String strReplacement)
84 {
85 super(strReplacement);
86 }
87
88
89
90
91
92
93 public PersonNameFinder(String strReplacement, String strLanguage)
94 {
95 super(strReplacement, strLanguage);
96 }
97
98
99
100
101 public String getNameModel()
102 {
103 return _strNameFinderModel;
104 }
105
106
107
108
109 public void setNameModel(String strModel)
110 {
111 _strNameFinderModel = strModel;
112 }
113
114
115
116
117 public static String getTokenModel()
118 {
119 return _strTokenModel;
120 }
121
122
123
124
125 public static void setTokenModel(String strModel)
126 {
127 _strTokenModel = strModel;
128 }
129
130
131
132
133 @Override
134 public List<String> findOccurrences(String strInput) throws FinderException
135 {
136 if (!_bInit)
137 {
138 init();
139 }
140 else
141 {
142 _nameFinder.clearAdaptiveData();
143 }
144
145 String strCleanedInput = clean(strInput);
146
147 List<String> listNames = new ArrayList<>();
148 String[] sentences = strCleanedInput.split("\"");
149 int nTotal = sentences.length;
150 System.out.println("NameFinder - number of text bloc to process: " + nTotal );
151 int nCount = 0;
152 int nPercent = 0;
153 for (String strSentece : sentences)
154 {
155 String[] tokens = _tokenizer.tokenize(strSentece);
156 Span nameSpans[] = _nameFinder.find(tokens);
157 for (Span span : nameSpans)
158 {
159 StringBuilder sbName = new StringBuilder();
160 for (int i = span.getStart(); i < span.getEnd(); i++)
161 {
162 if (i > span.getStart())
163 {
164 sbName.append(" ");
165 }
166 sbName.append(tokens[i]);
167 }
168 String strEntity = sbName.toString();
169 if (isValidPersonName(strEntity))
170 {
171 listNames.add(strEntity);
172 addEntity(strEntity);
173 }
174 }
175 nCount++;
176 int nNewPercent = 100 * nCount / nTotal;
177 if ( nNewPercent != nPercent )
178 {
179 nPercent = nNewPercent;
180 System.out.println("NameFinder - number of text bloc processed: " + nCount + " - (" + nPercent + "%)" );
181 }
182 }
183 return listNames;
184
185 }
186
187
188
189
190 @Override
191 public String replaceOccurrences(String strInputText) throws FinderException
192 {
193 return replaceOccurrences(strInputText, getReplacement());
194 }
195
196
197
198
199 @Override
200 public String replaceOccurrences(String strInput, String strReplacement) throws FinderException
201 {
202 if (!_bInit)
203 {
204 init();
205 }
206 else
207 {
208 _nameFinder.clearAdaptiveData();
209 }
210
211 String strCleanedInput = clean(strInput);
212 String[] sentences = strCleanedInput.split("\"");
213 int nTotal = sentences.length;
214 System.out.println("NameFinder - number of text bloc to process: " + nTotal );
215 StringBuilder sbOutput = new StringBuilder();
216 int nCount = 0;
217 int nPercent = 0;
218 for (String strSentece : sentences)
219 {
220 sbOutput.append('"');
221 String[] tokens = _tokenizer.tokenize(strSentece);
222 String[] output = new String[tokens.length];
223 Span nameSpans[] = _nameFinder.find(tokens);
224 int i = 0;
225 int j = 0;
226 while (true)
227 {
228
229 for (Span span : nameSpans)
230 {
231 if (i == span.getStart())
232 {
233 i = span.getEnd();
234
235 StringBuilder sbName = new StringBuilder();
236 for (int k = span.getStart(); k < span.getEnd(); k++)
237 {
238 if (k > span.getStart())
239 {
240 sbName.append(" ");
241 }
242 sbName.append(tokens[k]);
243 }
244 String strEntity = sbName.toString();
245 if (isValidPersonName(strEntity))
246 {
247 output[j++] = strReplacement;
248 }
249 else
250 {
251 output[j++] = strEntity;
252 }
253 }
254 }
255
256 if (i < tokens.length && j < output.length)
257 {
258 output[j++] = tokens[i++];
259 }
260 else
261 {
262 break;
263 }
264 }
265
266 for (String strWord : output)
267 {
268 if (strWord != null)
269 {
270 sbOutput.append(strWord).append(" ");
271 }
272 }
273 nCount++;
274 int nNewPercent = 100 * nCount / nTotal;
275 if ( nNewPercent != nPercent )
276 {
277 nPercent = nNewPercent;
278 System.out.println("NameFinder - number of text bloc processed: " + nCount + " - (" + nPercent + "%)" );
279 }
280 }
281 sbOutput.append('"');
282
283 System.out.println("NameFinder - number of text bloc processed: " + nCount);
284
285 String strOutput = sbOutput.toString();
286
287 strOutput = strOutput.replaceAll( "# # # # # # # # " , "\n" );
288 strOutput = strOutput.replaceAll( "# # # # # # # #" , "" );
289 strOutput = strOutput.replaceAll( "# # # # # # ##" , "" );
290 strOutput = strOutput.replaceAll( "## # # # # ##" , "" );
291 strOutput = strOutput.replaceAll( "## # # # # # #" , "" );
292 strOutput = strOutput.replaceAll( "#\n# # # # # ##" , "" );
293
294
295 return strOutput;
296 }
297
298
299
300
301
302
303 private void init() throws FinderException
304 {
305 String strTokenModel = MessageFormat.format(_strTokenModel, getLanguage());
306 String strNameFinderModel = MessageFormat.format(_strNameFinderModel, getLanguage());
307 try (InputStream isTokenModel = PersonNameFinder.class.getResourceAsStream(strTokenModel);
308 InputStream isNameFinderModel = PersonNameFinder.class.getResourceAsStream(strNameFinderModel))
309 {
310 TokenizerModel tm = new TokenizerModel(isTokenModel);
311 _tokenizer = new TokenizerME(tm);
312 TokenNameFinderModel model = new TokenNameFinderModel(isNameFinderModel);
313 _nameFinder = new NameFinderME(model);
314 _bInit = true;
315 }
316 catch (IOException ex)
317 {
318 throw new FinderException("Error loading model : " + ex.getMessage(), ex);
319 }
320
321 }
322
323 private String clean(String strInput)
324 {
325 String strClean = strInput.replaceAll("_x000D_", "");
326 strClean = strClean.replaceAll( "\n", "########" );
327
328 return strClean;
329 }
330
331 private boolean isValidPersonName(String strInput)
332 {
333 for (String strIgnore : _ignoredNames)
334 {
335 if (strInput.equals(strIgnore))
336 {
337 return false;
338 }
339 }
340 return true;
341
342 }
343 }