PersonNameFinder.java

/*
 * Copyright (c) 2002-2020, Mairie de Paris
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice
 *     and the following disclaimer.
 *
 *  2. Redistributions in binary form must reproduce the above copyright notice
 *     and the following disclaimer in the documentation and/or other materials
 *     provided with the distribution.
 *
 *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
 *     contributors may be used to endorse or promote products derived from
 *     this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * License 1.0
 */
package fr.paris.lutece.nlptools;

import java.io.IOException;
import java.io.InputStream;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

/**
 * Finder that detects person names in a text with OpenNLP (a tokenizer model
 * plus a person name finder model) and can replace them with a replacement
 * string.
 *
 * <p>
 * The input is processed block by block, blocks being delimited by the double
 * quote character. The model resource names are {@link MessageFormat} patterns
 * in which <code>{0}</code> is replaced by the language of the finder. Models
 * are loaded lazily on the first call to {@link #findOccurrences(String)} or
 * {@link #replaceOccurrences(String, String)}.
 * </p>
 */
public class PersonNameFinder extends AbstractFinder
{

    private static final String TOKEN_DEFAULT_MODEL = "/fr/paris/lutece/nlptools/models/{0}-token.bin";
    private static final String NAME_FINDER_DEFAULT_MODEL = "/fr/paris/lutece/nlptools/models/{0}-ner-person.bin";

    /** Entities reported by the name finder model that must never be treated as person names. */
    private static final String[] _ignoredNames =
    {
        "Les", "Merci", "R", "Il", "Elle", "Dans", "Pour", "De", "Bien", "Monsieur", "Votre", "Cordialement", "Et", "Ils", "Nous", "Vous", "Sur", "Mes", "Je", "Ainsi", "Ville", "J'ai" , "En", "Alors"
    };

    /** Token model resource pattern, shared by all finders initialized after it is set. */
    private static String _strTokenModel = TOKEN_DEFAULT_MODEL;

    private String _strNameFinderModel = NAME_FINDER_DEFAULT_MODEL;

    // The tokenizer is built from a language dependent resource, hence it is kept
    // per instance: a shared static tokenizer would be overwritten by the last
    // initialized finder, whatever its language.
    private TokenizerME _tokenizer;
    private NameFinderME _nameFinder;

    private boolean _bInit;

    /**
     * Constructor
     */
    public PersonNameFinder()
    {
        super();
    }

    /**
     * Constructor
     *
     * @param strReplacement The replacement
     */
    public PersonNameFinder(String strReplacement)
    {
        super(strReplacement);
    }

    /**
     * Constructor
     *
     * @param strReplacement The replacement
     * @param strLanguage The language passed to the parent finder; the value returned by getLanguage() is
     *                    substituted for <code>{0}</code> in the model resource names
     */
    public PersonNameFinder(String strReplacement, String strLanguage)
    {
        super(strReplacement, strLanguage);
    }

    /**
     * @return the name finder model resource pattern
     */
    public String getNameModel()
    {
        return _strNameFinderModel;
    }

    /**
     * @param strModel the name finder model resource pattern to set. It is only taken into account if it is set
     *                 before the first search or replacement, since models are loaded once.
     */
    public void setNameModel(String strModel)
    {
        _strNameFinderModel = strModel;
    }

    /**
     * @return the token model resource pattern
     */
    public static String getTokenModel()
    {
        return _strTokenModel;
    }

    /**
     * @param strModel the token model resource pattern to set. It applies to finders that load their models
     *                 after this call.
     */
    public static void setTokenModel(String strModel)
    {
        _strTokenModel = strModel;
    }

    /**
     * {@inheritDoc }
     */
    @Override
    public List<String> findOccurrences(String strInput) throws FinderException
    {
        prepare();

        String[] sentences = clean(strInput).split("\"");
        int nTotal = sentences.length;
        System.out.println("NameFinder - number of text bloc to process: " + nTotal );

        List<String> listNames = new ArrayList<>();
        int nCount = 0;
        int nPercent = 0;
        for (String strSentence : sentences)
        {
            String[] tokens = _tokenizer.tokenize(strSentence);
            for (Span span : _nameFinder.find(tokens))
            {
                String strEntity = joinTokens(tokens, span);
                if (isValidPersonName(strEntity))
                {
                    listNames.add(strEntity);
                    addEntity(strEntity);
                }
            }
            nCount++;
            nPercent = reportProgress(nCount, nTotal, nPercent);
        }
        return listNames;
    }

    /**
     * {@inheritDoc }
     */
    @Override
    public String replaceOccurrences(String strInputText) throws FinderException
    {
        return replaceOccurrences(strInputText, getReplacement());
    }

    /**
     * {@inheritDoc }
     */
    @Override
    public String replaceOccurrences(String strInput, String strReplacement) throws FinderException
    {
        prepare();

        String[] sentences = clean(strInput).split("\"");
        int nTotal = sentences.length;
        System.out.println("NameFinder - number of text bloc to process: " + nTotal );

        StringBuilder sbOutput = new StringBuilder();
        int nCount = 0;
        int nPercent = 0;
        for (String strSentence : sentences)
        {
            // Every block is written back prefixed by the delimiter it was split on
            sbOutput.append('"');
            String[] tokens = _tokenizer.tokenize(strSentence);
            Span[] nameSpans = _nameFinder.find(tokens);
            int nToken = 0;
            while (true)
            {
                // Consume every span starting at the current token: a valid name is
                // replaced, a rejected entity is written back unchanged
                for (Span span : nameSpans)
                {
                    if (nToken == span.getStart())
                    {
                        nToken = span.getEnd();
                        String strEntity = joinTokens(tokens, span);
                        appendWord(sbOutput, isValidPersonName(strEntity) ? strReplacement : strEntity);
                    }
                }

                if (nToken >= tokens.length)
                {
                    break;
                }
                appendWord(sbOutput, tokens[nToken++]);
            }
            nCount++;
            nPercent = reportProgress(nCount, nTotal, nPercent);
        }
        sbOutput.append('"');

        System.out.println("NameFinder - number of text bloc processed: " + nCount);

        // Turn the tokenized line break markers inserted by clean() back into line
        // breaks, then drop the remaining marker fragments. The order matters.
        String strOutput = sbOutput.toString();
        strOutput = strOutput.replace( "# # # # # # # # " , "\n" );
        strOutput = strOutput.replace( "# # # # # # # #" , "" );
        strOutput = strOutput.replace( "# # # # # # ##" , "" );
        strOutput = strOutput.replace( "## # # # # ##" , "" );
        strOutput = strOutput.replace( "## # # # # # #" , "" );
        strOutput = strOutput.replace( "#\n# # # # # ##" , "" );

        return strOutput;
    }

    /**
     * Make the finder ready for a new document: load the models on first use, otherwise reset the adaptive data
     * the name finder collected on the previous document.
     *
     * @throws FinderException if the models cannot be loaded
     */
    private void prepare() throws FinderException
    {
        if (_bInit)
        {
            _nameFinder.clearAdaptiveData();
        }
        else
        {
            init();
        }
    }

    /**
     * Initialize the finder by loading models
     *
     * @throws FinderException if a model resource is missing or cannot be read
     */
    private void init() throws FinderException
    {
        String strTokenModel = MessageFormat.format(_strTokenModel, getLanguage());
        String strNameFinderModel = MessageFormat.format(_strNameFinderModel, getLanguage());
        try (InputStream isTokenModel = PersonNameFinder.class.getResourceAsStream(strTokenModel);
                InputStream isNameFinderModel = PersonNameFinder.class.getResourceAsStream(strNameFinderModel))
        {
            // getResourceAsStream returns null for a missing resource: report it as a
            // loading error instead of letting the model constructors fail on null
            if (isTokenModel == null)
            {
                throw new IOException("resource not found " + strTokenModel);
            }
            if (isNameFinderModel == null)
            {
                throw new IOException("resource not found " + strNameFinderModel);
            }

            TokenizerME tokenizer = new TokenizerME(new TokenizerModel(isTokenModel));
            NameFinderME nameFinder = new NameFinderME(new TokenNameFinderModel(isNameFinderModel));

            // Assign only once both models are loaded so the finder is never half initialized
            _tokenizer = tokenizer;
            _nameFinder = nameFinder;
            _bInit = true;
        }
        catch (IOException ex)
        {
            throw new FinderException("Error loading model : " + ex.getMessage(), ex);
        }
    }

    /**
     * Build the text of an entity by joining the tokens of a span with single spaces.
     *
     * @param tokens The tokens of the block
     * @param span The span of the entity within the tokens
     * @return The entity text
     */
    private static String joinTokens(String[] tokens, Span span)
    {
        StringBuilder sbName = new StringBuilder();
        for (int i = span.getStart(); i < span.getEnd(); i++)
        {
            if (i > span.getStart())
            {
                sbName.append(" ");
            }
            sbName.append(tokens[i]);
        }
        return sbName.toString();
    }

    /**
     * Append a word followed by a space to the output. Null words (e.g. a null replacement) are skipped.
     *
     * @param sbOutput The output
     * @param strWord The word to append
     */
    private static void appendWord(StringBuilder sbOutput, String strWord)
    {
        if (strWord != null)
        {
            sbOutput.append(strWord).append(" ");
        }
    }

    /**
     * Print the progress on the standard output each time the integer percentage changes.
     *
     * @param nCount The number of blocks processed so far
     * @param nTotal The total number of blocks, greater than zero
     * @param nPreviousPercent The last percentage computed
     * @return The current percentage
     */
    private static int reportProgress(int nCount, int nTotal, int nPreviousPercent)
    {
        // Computed as a long: 100 * nCount would overflow an int on very large inputs
        int nPercent = (int) (100L * nCount / nTotal);
        if (nPercent != nPreviousPercent)
        {
            System.out.println("NameFinder - number of text bloc processed: " + nCount + " - (" + nPercent + "%)" );
        }
        return nPercent;
    }

    /**
     * Prepare the input for tokenization: remove the "_x000D_" sequences and replace each line break by a
     * "########" marker, which replaceOccurrences converts back once the text is rebuilt.
     *
     * @param strInput The raw input
     * @return The cleaned input
     */
    private String clean(String strInput)
    {
        return strInput.replace("_x000D_", "").replace("\n", "########");
    }

    /**
     * Tell whether an entity found by the model is kept as a person name.
     *
     * @param strInput The entity text
     * @return false if the entity exactly matches one of the ignored names, true otherwise
     */
    private boolean isValidPersonName(String strInput)
    {
        for (String strIgnore : _ignoredNames)
        {
            if (strInput.equals(strIgnore))
            {
                return false;
            }
        }
        return true;
    }
}