1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.util.mail;
35
36 import fr.paris.lutece.portal.service.util.AppLogService;
37
38 import org.w3c.dom.Document;
39 import org.w3c.dom.NamedNodeMap;
40 import org.w3c.dom.Node;
41 import org.w3c.dom.NodeList;
42
43 import org.w3c.tidy.Tidy;
44
45 import java.io.ByteArrayInputStream;
46
47 import java.net.MalformedURLException;
48 import java.net.URL;
49
50 import java.util.ArrayList;
51 import java.util.HashMap;
52 import java.util.List;
53 import java.util.Map;
54
55 import javax.activation.DataHandler;
56
57
58
59
60 public class HtmlDocument
61 {
62
63
64
65
66
67
68
69
70
71 public static final ElementUrl ELEMENT_CSS;
72
73
74
75
76
77
78
79
80
81 public static final ElementUrl ELEMENT_JAVASCRIPT;
82
83
84
85
86
87
88
89
90 public static final ElementUrl ELEMENT_IMG;
91
92 static
93 {
94 ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );
95 ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
96 ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
97 }
98
99 private Document _content;
100 private String _strBaseUrl;
101 private boolean _useAbsoluteUrl;
102
103
104
105
106
107
108
109
110
111
112
113 public HtmlDocument( String strHtml, String strBaseUrl, boolean useAbsoluteUrl )
114 {
115
116 Tidy tidy = new Tidy( );
117 tidy.setQuiet( true );
118 tidy.setShowWarnings( false );
119
120 _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes( ) ), null );
121 _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
122 _useAbsoluteUrl = useAbsoluteUrl;
123 }
124
125
126
127
128
129
130
131
132 public Map<String, URL> getAllUrls( ElementUrl elementType )
133 {
134 Map<String, URL> mapUrl = new HashMap<>( );
135
136 NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
137
138 for ( int i = 0; i < nodes.getLength( ); i++ )
139 {
140 Node node = nodes.item( i );
141 NamedNodeMap attributes = node.getAttributes( );
142
143
144 if ( elementType.getTestedAttributeName( ) != null )
145 {
146 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
147
148 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
149 {
150 continue;
151 }
152 }
153
154
155 String strSrc = attributes.getNamedItem( elementType.getAttributeName( ) ).getNodeValue( );
156
157 if ( strSrc.startsWith( _strBaseUrl ) )
158 {
159 try
160 {
161 URL url = new URL( strSrc );
162 mapUrl.put( getUrlName( url ), url );
163 }
164 catch( MalformedURLException e )
165 {
166
167 AppLogService.info( " {} not found, location ignored.", strSrc );
168 }
169 }
170 }
171
172 return mapUrl;
173 }
174
175
176
177
178
179
180
181
182 public List<UrlAttachment> getAllUrlsAttachement( ElementUrl elementType )
183 {
184 List<UrlAttachment> listUrlAttachement = new ArrayList<>( );
185 NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
186
187 for ( int i = 0; i < nodes.getLength( ); i++ )
188 {
189 Node node = nodes.item( i );
190 NamedNodeMap attributes = node.getAttributes( );
191
192
193 if ( elementType.getTestedAttributeName( ) != null )
194 {
195 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
196
197 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
198 {
199 continue;
200 }
201 }
202
203
204 String strAttributeName = elementType.getAttributeName( );
205
206 if ( ( strAttributeName != null ) && ( attributes != null ) )
207 {
208 Node attributeNode = attributes.getNamedItem( strAttributeName );
209 createAttributeUrl( attributeNode, listUrlAttachement );
210 }
211 }
212
213 return listUrlAttachement;
214 }
215
216 private void createAttributeUrl( Node attributeNode, List<UrlAttachment> listUrlAttachement )
217 {
218 if ( attributeNode != null )
219 {
220 String strSrc = attributeNode.getNodeValue( );
221
222 if ( ( strSrc != null ) && strSrc.startsWith( _strBaseUrl ) )
223 {
224 try
225 {
226 URL url = new URL( strSrc );
227 UrlAttachmenthtml#UrlAttachment">UrlAttachment urlAttachement = new UrlAttachment( getUrlName( url ), url );
228 listUrlAttachement.add( urlAttachement );
229 }
230 catch( MalformedURLException e )
231 {
232
233 AppLogService.info( " {} not found, location ignored.", strSrc );
234 }
235 }
236 }
237 }
238
239
240
241
242
243
244
245
246 protected Object getUrlContent( URL url )
247 {
248 return new DataHandler( url );
249 }
250
251
252
253
254
255
256
257
258 protected String getUrlName( URL url )
259 {
260 return _useAbsoluteUrl ? url.toExternalForm( ) : url.getPath( );
261 }
262
263
264
265
266 private static class ElementUrl
267 {
268 private String _strTagName;
269 private String _strAttributeName;
270 private String _strTestedAttributeName;
271 private String _strTestedAttributeValue;
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286 public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName, String strTestedAttributeValue )
287 {
288 _strTagName = strTagName;
289 _strAttributeName = strAttributeName;
290 _strTestedAttributeName = strTestedAttributeName;
291 _strTestedAttributeValue = strTestedAttributeValue;
292 }
293
294
295
296
297
298
299 public String getAttributeName( )
300 {
301 return _strAttributeName;
302 }
303
304
305
306
307
308
309 public String getTagName( )
310 {
311 return _strTagName;
312 }
313
314
315
316
317
318
319 public String getTestedAttributeName( )
320 {
321 return _strTestedAttributeName;
322 }
323
324
325
326
327
328
329 public String getTestedAttributeValue( )
330 {
331 return _strTestedAttributeValue;
332 }
333 }
334 }