1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 package fr.paris.lutece.util.mail;
35
36 import fr.paris.lutece.portal.service.util.AppLogService;
37
38 import org.w3c.dom.Document;
39 import org.w3c.dom.NamedNodeMap;
40 import org.w3c.dom.Node;
41 import org.w3c.dom.NodeList;
42
43 import org.w3c.tidy.Tidy;
44
45 import java.io.ByteArrayInputStream;
46
47 import java.net.MalformedURLException;
48 import java.net.URL;
49
50 import java.util.ArrayList;
51 import java.util.HashMap;
52 import java.util.List;
53 import java.util.Map;
54
55 import javax.activation.DataHandler;
56
57
58
59
60
61
62 public class HtmlDocument
63 {
64
65
66
67
68
69
70
71
72
73 public static final ElementUrl ELEMENT_CSS;
74
75
76
77
78
79
80
81
82
83 public static final ElementUrl ELEMENT_JAVASCRIPT;
84
85
86
87
88
89
90
91
92 public static final ElementUrl ELEMENT_IMG;
93
94 static
95 {
96 ELEMENT_CSS = new ElementUrl( "link", "href", "rel", "stylesheet" );
97 ELEMENT_JAVASCRIPT = new ElementUrl( "script", "src", "type", "text/javascript" );
98 ELEMENT_IMG = new ElementUrl( "img", "src", null, null );
99 }
100
101 private Document _content;
102 private String _strBaseUrl;
103 private boolean _useAbsoluteUrl;
104
105
106
107
108
109
110
111
112 public HtmlDocument( String strHtml, String strBaseUrl, boolean useAbsoluteUrl )
113 {
114
115 Tidy tidy = new Tidy( );
116 tidy.setQuiet( true );
117 tidy.setShowWarnings( false );
118
119 _content = tidy.parseDOM( new ByteArrayInputStream( strHtml.getBytes( ) ), null );
120 _strBaseUrl = ( strBaseUrl == null ) ? "" : strBaseUrl;
121 _useAbsoluteUrl = useAbsoluteUrl;
122 }
123
124
125
126
127
128
129
130 public Map<String, URL> getAllUrls( ElementUrl elementType )
131 {
132 Map<String, URL> mapUrl = new HashMap<String, URL>( );
133
134 NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
135
136 for ( int i = 0; i < nodes.getLength( ); i++ )
137 {
138 Node node = nodes.item( i );
139 NamedNodeMap attributes = node.getAttributes( );
140
141
142 if ( elementType.getTestedAttributeName( ) != null )
143 {
144 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
145
146 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
147 {
148 continue;
149 }
150 }
151
152
153 String strSrc = attributes.getNamedItem( elementType.getAttributeName( ) ).getNodeValue( );
154
155 if ( strSrc.startsWith( _strBaseUrl ) )
156 {
157 try
158 {
159 URL url = new URL( strSrc );
160 mapUrl.put( getUrlName( url ), url );
161 }
162 catch ( MalformedURLException e )
163 {
164
165 AppLogService.info( strSrc + " not found, location ignored." );
166 }
167 }
168 }
169
170 return mapUrl;
171 }
172
173
174
175
176
177
178
179 public List<UrlAttachment> getAllUrlsAttachement( ElementUrl elementType )
180 {
181 List<UrlAttachment> listUrlAttachement = new ArrayList<UrlAttachment>( );
182 UrlAttachment urlAttachement;
183 NodeList nodes = _content.getElementsByTagName( elementType.getTagName( ) );
184
185 for ( int i = 0; i < nodes.getLength( ); i++ )
186 {
187 Node node = nodes.item( i );
188 NamedNodeMap attributes = node.getAttributes( );
189
190
191 if ( elementType.getTestedAttributeName( ) != null )
192 {
193 String strRel = attributes.getNamedItem( elementType.getTestedAttributeName( ) ).getNodeValue( );
194
195 if ( !elementType.getTestedAttributeValue( ).equals( strRel ) )
196 {
197 continue;
198 }
199 }
200
201
202 String strAttributeName = elementType.getAttributeName( );
203
204 if ( ( strAttributeName != null ) && ( attributes != null ) )
205 {
206 Node attributeNode = attributes.getNamedItem( strAttributeName );
207
208 if ( attributeNode != null )
209 {
210 String strSrc = attributeNode.getNodeValue( );
211
212 if ( ( strSrc != null ) && strSrc.startsWith( _strBaseUrl ) )
213 {
214 try
215 {
216 URL url = new URL( strSrc );
217 urlAttachement = new UrlAttachment( getUrlName( url ), url );
218 listUrlAttachement.add( urlAttachement );
219 }
220 catch ( MalformedURLException e )
221 {
222
223 AppLogService.info( strSrc + " not found, location ignored." );
224 }
225 }
226 }
227 }
228 }
229
230 return listUrlAttachement;
231 }
232
233
234
235
236
237
238
239 protected Object getUrlContent( URL url )
240 {
241 return new DataHandler( url );
242 }
243
244
245
246
247
248
249 protected String getUrlName( URL url )
250 {
251 return _useAbsoluteUrl ? url.toExternalForm( ) : url.getPath( );
252 }
253
254
255
256
257 private static class ElementUrl
258 {
259 private String _strTagName;
260 private String _strAttributeName;
261 private String _strTestedAttributeName;
262 private String _strTestedAttributeValue;
263
264
265
266
267
268
269
270
271
272
273
274 public ElementUrl( String strTagName, String strAttributeName, String strTestedAttributeName,
275 String strTestedAttributeValue )
276 {
277 _strTagName = strTagName;
278 _strAttributeName = strAttributeName;
279 _strTestedAttributeName = strTestedAttributeName;
280 _strTestedAttributeValue = strTestedAttributeValue;
281 }
282
283
284
285
286
287 public String getAttributeName( )
288 {
289 return _strAttributeName;
290 }
291
292
293
294
295
296 public String getTagName( )
297 {
298 return _strTagName;
299 }
300
301
302
303
304
305 public String getTestedAttributeName( )
306 {
307 return _strTestedAttributeName;
308 }
309
310
311
312
313
314 public String getTestedAttributeValue( )
315 {
316 return _strTestedAttributeValue;
317 }
318 }
319 }