View Javadoc
1   /*
2    * Copyright (c) 2002-2017, Mairie de Paris
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions
7    * are met:
8    *
9    *  1. Redistributions of source code must retain the above copyright notice
10   *     and the following disclaimer.
11   *
12   *  2. Redistributions in binary form must reproduce the above copyright notice
13   *     and the following disclaimer in the documentation and/or other materials
14   *     provided with the distribution.
15   *
16   *  3. Neither the name of 'Mairie de Paris' nor 'Lutece' nor the names of its
17   *     contributors may be used to endorse or promote products derived from
18   *     this software without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
24   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   * POSSIBILITY OF SUCH DAMAGE.
31   *
32   * License 1.0
33   */
34  package fr.paris.lutece.plugins.comarquage.modules.solr.utils.parsers;
35  
36  import java.io.IOException;
37  import java.text.ParseException;
38  import java.text.SimpleDateFormat;
39  import java.util.ArrayList;
40  import java.util.Date;
41  import java.util.List;
42  import java.util.Locale;
43  
44  import javax.xml.parsers.ParserConfigurationException;
45  import javax.xml.parsers.SAXParser;
46  import javax.xml.parsers.SAXParserFactory;
47  
48  import org.xml.sax.Attributes;
49  import org.xml.sax.SAXException;
50  import org.xml.sax.helpers.DefaultHandler;
51  
52  import fr.paris.lutece.plugins.search.solr.indexer.SolrItem;
53  import fr.paris.lutece.plugins.search.solr.util.SolrConstants;
54  import fr.paris.lutece.portal.service.content.XPageAppService;
55  import fr.paris.lutece.portal.service.util.AppLogService;
56  import fr.paris.lutece.portal.service.util.AppPathService;
57  import fr.paris.lutece.portal.service.util.AppPropertiesService;
58  import fr.paris.lutece.util.url.UrlItem;
59  
60  
61  /**
62   * Parser for public cards (comarquage)
63   */
64  public class CoMarquageSolrPublicParser extends DefaultHandler
65  {
66      // -------------
67      // - Constants -
68      // -------------
69      // Plugin name
70      private static final String PROPERTY_PLUGIN_NAME = "comarquage.plugin.name";
71  
72      // CDC index keys
73      private static final String PROPERTY_INDEXING_XML_BASE_VAR = "comarquage.path.xml";
74      private static final String PROPERTY_INDEXING_FRAGMENT = "comarquage.indexing.";
75      private static final String PROPERTY_LIST_CDC_INDEX_KEYS_FRAGMENT = "listCdcIndexKeys";
76  
77      // XPath comparisons
78      private static final String PROPERTY_XPATH_CARD = "comarquage.parser.xpath.public.card";
79      private static final String PROPERTY_XPATH_URL = "comarquage.parser.xpath.public.url";
80      private static final String PROPERTY_XPATH_DATE = "comarquage.parser.xpath.public.date";
81      private static final String PROPERTY_XPATH_TITLE = "comarquage.parser.xpath.public.title";
82      private static final String PROPERTY_XPATH_THEME = "comarquage.parser.xpath.public.theme";
83      private static final String PROPERTY_XPATH_KEYWORDS = "comarquage.parser.xpath.public.keywords";
84  
85      // Index type
86      private static final String PROPERTY_INDEXING_TYPE = "comarquage-solr.indexing.publicType";
87  
88      // Site name
89      private static final String PROPERTY_SITE = "lutece.name";
90      private static final String PROPERTY_PROD_URL = "lutece.prod.url";
91  
92      // Paths contents
93      private static final String PROPERTY_PATH_ID = "comarquage.parser.path.id";
94  
95      // URL delimiter
96      private static final String PROPERTY_URL_DELIMITER = "comarquage.parser.url.public.delimiter";
97  
98      // Strings
99      private static final String STRING_EMPTY = "";
100     private static final String STRING_EQUAL = "=";
101     private static final String STRING_SLASH = "/";
102     private static final String STRING_SPACE = " ";
103     private static final String SHORT_NAME = "comgepub";
104     
105     // -------------
106     // - Variables -
107     // -------------
108     // List of Solr items
109     private List<SolrItem> _listSolrItems;
110 
111     // XPath
112     private String _strXPath;
113 
114     // Contents
115     private String _strUrl;
116     private String _strDate;
117     private String _strType;
118     private String _strTitle;
119     private String _strSite;
120     private String _strProdUrl;
121     private String _strTheme;
122     private String _strKeywords;
123 
124     /**
125      * Initializes and launches the parsing of the public cards (public constructor)
126      */
127     public CoMarquageSolrPublicParser(  )
128     {
129         // Gets the list of CDC index keys
130         String strCdcIndexKeys = AppPropertiesService.getProperty( PROPERTY_INDEXING_FRAGMENT +
131                 PROPERTY_LIST_CDC_INDEX_KEYS_FRAGMENT );
132 
133         // Initializes the Solr Item list
134         _listSolrItems = new ArrayList<SolrItem>(  );
135 
136         // Initializes the indexing type
137         _strType = AppPropertiesService.getProperty( PROPERTY_INDEXING_TYPE );
138 
139         // Initializes the site
140         _strSite = AppPropertiesService.getProperty( PROPERTY_SITE );
141 
142         // Initializes the prod url
143         _strProdUrl = AppPropertiesService.getProperty( PROPERTY_PROD_URL );
144 
145         if ( !_strProdUrl.endsWith( "/" ) )
146         {
147             _strProdUrl = _strProdUrl + "/";
148         }
149 
150         try
151         {
152             // Initializes the SAX parser
153             SAXParserFactory factory = SAXParserFactory.newInstance(  );
154             SAXParser parser = factory.newSAXParser(  );
155 
156             // Splits the list of CDC index keys
157             String[] splitKeys = strCdcIndexKeys.split( "," );
158 
159             for ( int i = 0; i < splitKeys.length; i++ )
160             {
161                 // Gets the XML index file path
162                 String strXmlFile = AppPropertiesService.getProperty( PROPERTY_INDEXING_FRAGMENT + splitKeys[i] );
163                 String strXmlPath = AppPathService.getPath( PROPERTY_INDEXING_XML_BASE_VAR, strXmlFile );
164 
165                 // Launches the parsing of this file (with the current handler)
166                 parser.parse( strXmlPath, this );
167             }
168         }
169         catch ( ParserConfigurationException e )
170         {
171             AppLogService.error( e.getMessage(  ), e );
172         }
173         catch ( SAXException e )
174         {
175             AppLogService.error( e.getMessage(  ), e );
176         }
177         catch ( IOException e )
178         {
179             AppLogService.error( e.getMessage(  ), e );
180         }
181     }
182 
183     /**
184      * Event received when starting the parsing operation
185      *
186      * @throws SAXException any SAX exception
187      */
188     public void startDocument(  ) throws SAXException
189     {
190         // Initializes the XPATH
191         _strXPath = STRING_EMPTY;
192 
193         // Initializes the contents
194         _strUrl = STRING_EMPTY;
195         _strDate = STRING_EMPTY;
196         _strTitle = STRING_EMPTY;
197         _strTheme = STRING_EMPTY;
198         _strKeywords = STRING_EMPTY;
199     }
200 
201     /**
202      * Event received at the end of the parsing operation
203      *
204      * @throws SAXException any SAX exception
205      */
206     public void endDocument(  ) throws SAXException
207     {
208         // Nothing to do
209     }
210 
211     /**
212      * Event received at the start of an element
213      *
214      * @param uri the Namespace URI
215      * @param localName the local name
216      * @param qName the qualified XML name
217      * @param atts the attributes attached to the element
218      *
219      * @throws SAXException any SAX exception
220      */
221     public void startElement( String uri, String localName, String qName, Attributes atts )
222         throws SAXException
223     {
224         // Updates the XPath
225         _strXPath += ( STRING_SLASH + qName );
226 
227         // Resets the contents
228         String strXPathCard = AppPropertiesService.getProperty( PROPERTY_XPATH_CARD );
229 
230         if ( ( _strXPath != null ) && _strXPath.equals( strXPathCard ) )
231         {
232             _strUrl = STRING_EMPTY;
233             _strTitle = STRING_EMPTY;
234             _strTheme = STRING_EMPTY;
235             _strKeywords = STRING_EMPTY;
236         }
237     }
238 
239     /**
240     * Event received at the end of an element
241     *
242     * @param uri the Namespace URI
243     * @param localName the local name
244     * @param qName the qualified XML name
245     *
246     * @throws SAXException any SAX exception
247     */
248     public void endElement( String uri, String localName, String qName )
249         throws SAXException
250     {
251         // If all the contents are retrieved (end of card)
252         String strXPathCard = AppPropertiesService.getProperty( PROPERTY_XPATH_CARD );
253 
254         if ( ( _strXPath != null ) && _strXPath.equals( strXPathCard ) )
255         {
256             // Sets the path
257             String strDelimiter = AppPropertiesService.getProperty( PROPERTY_URL_DELIMITER ) + STRING_EQUAL;
258             String strPath = _strUrl.split( strDelimiter )[1];
259 
260             // Sets the full URL
261             UrlItem url = new UrlItem( _strProdUrl );
262             url.addParameter( XPageAppService.PARAM_XPAGE_APP, AppPropertiesService.getProperty( PROPERTY_PLUGIN_NAME ) );
263             url.addParameter( AppPropertiesService.getProperty( PROPERTY_PATH_ID ), strPath );
264             
265             // Sets the contents
266             String strContents = _strTitle + STRING_SPACE + _strKeywords + STRING_SPACE + _strTheme;
267 
268             // Converts the date from "dd MMMMM yyyy" to "yyyyMMdd"
269             Locale locale = Locale.FRENCH;
270             Date dateUpdate = null;
271 
272             try
273             {
274                 SimpleDateFormat dateFormat = new SimpleDateFormat( "dd MMMMM yyyy", locale );
275                 dateUpdate = dateFormat.parse( _strDate );
276 
277                 dateFormat.applyPattern( "yyyyMMdd" );
278             }
279             catch ( ParseException e )
280             {
281                 dateUpdate = null;
282             }
283 
284             // Creates a new lucene document
285             SolrItem item = new SolrItem(  );
286 
287             // Sets the document fields
288             // * FIELD_URL		: stored and indexed (without the analyser)
289             // * FIELD_DATE		: stored and indexed (without the analyser)
290             // * FIELD_UID		: stored and not indexed (the UID already exists in the URL)
291             // * FIELD_CONTENTS	: not stored (saves disk space) and indexed (with the analyser)
292             // * FIELD_TITLE	: stored and not indexed (the title already exists in the contents)
293             // * FIELD_TYPE		: stored and indexed (without the analyser) -> allows to filter the search by type
294             item.setUrl( url.getUrl(  ) );
295             item.setDate( dateUpdate );
296             item.setUid( strPath + SolrConstants.CONSTANT_UNDERSCORE + SHORT_NAME );
297             item.setContent( strContents );
298             item.setTitle( _strTitle );
299             item.setType( _strType );
300             item.setSite( _strSite );
301 
302             // Adds the new item to the map
303             _listSolrItems.add( item );
304         }
305 
306         // Updates the XPath
307         _strXPath = _strXPath.substring( 0, _strXPath.lastIndexOf( STRING_SLASH ) );
308     }
309 
310     /**
311     * Event received when the analyzer encounters text (between two tags)
312     *
313     * @param ch the characters from the XML document
314     * @param start the start position in the array
315     * @param length the number of characters to read from the array
316     *
317     * @throws SAXException any SAX exception
318     */
319     public void characters( char[] ch, int start, int length )
320         throws SAXException
321     {
322         // Gets the XPath comparisons properties
323         String strXPathUrl = AppPropertiesService.getProperty( PROPERTY_XPATH_URL );
324         String strXPathDate = AppPropertiesService.getProperty( PROPERTY_XPATH_DATE );
325         String strXPathTitle = AppPropertiesService.getProperty( PROPERTY_XPATH_TITLE );
326         String strXPathTheme = AppPropertiesService.getProperty( PROPERTY_XPATH_THEME );
327         String strXPathKeywords = AppPropertiesService.getProperty( PROPERTY_XPATH_KEYWORDS );
328 
329         // Gets the URL
330         if ( ( _strXPath != null ) && _strXPath.equals( strXPathUrl ) )
331         {
332             _strUrl += new String( ch, start, length );
333         }
334 
335         // Gets the date
336         else if ( ( _strXPath != null ) && _strXPath.equals( strXPathDate ) )
337         {
338             _strDate += new String( ch, start, length );
339         }
340 
341         // Gets the title
342         else if ( ( _strXPath != null ) && _strXPath.equals( strXPathTitle ) )
343         {
344             _strTitle += new String( ch, start, length );
345         }
346 
347         // Gets the theme
348         else if ( ( _strXPath != null ) && _strXPath.equals( strXPathTheme ) )
349         {
350             if ( ( _strTheme != null ) && !_strTheme.equals( STRING_EMPTY ) )
351             {
352                 _strTheme += ( STRING_SPACE + new String( ch, start, length ) );
353             }
354             else
355             {
356                 _strTheme += new String( ch, start, length );
357             }
358         }
359 
360         // Gets the keywords
361         else if ( ( _strXPath != null ) && _strXPath.equals( strXPathKeywords ) )
362         {
363             _strKeywords += new String( ch, start, length );
364         }
365     }
366 
367     /**
368      * Gets the list of Solr items
369      *
370      * @return The list of Solr items
371      */
372     public List<SolrItem> getPublicSolrItems(  )
373     {
374         return _listSolrItems;
375     }
376 }