View Javadoc
1   /*
2    * Copyright (c) 2007 Yaroslav Stavnichiy, yarosla@gmail.com
3    *
4    * Latest version of this software can be obtained from:
5    *   http://web-tec.info/WikiParser
6    *
7    * Permission is hereby granted, free of charge, to any person obtaining a copy
8    * of this software and associated documentation files (the "Software"), to deal
9    * in the Software without restriction, including without limitation the rights
10   * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11   * copies of the Software, and to permit persons to whom the Software is
12   * furnished to do so, subject to the following conditions:
13   *
14   * The above copyright notice and this permission notice shall be included in all
15   * copies or substantial portions of the Software.
16  
17   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23   * SOFTWARE.
24   *
25   * If you make use of this code, I'd appreciate hearing about it.
26   * Comments, suggestions, and bug reports welcome: yarosla@gmail.com
27   */
28  package ys.wikiparser;
29  
30  import java.util.*;
31  
/**
 * Static string helpers used by the wiki parser: URL character classification, wiki text
 * normalisation, HTML escaping/unescaping, simple splitting/replacing and Cyrillic
 * transliteration.
 *
 * All methods are stateless; the only shared data are immutable lookup tables.
 */
public class Utils
{
    /** Punctuation accepted inside URLs in addition to ASCII letters and digits (the apostrophe is deliberately excluded). */
    private static final String URL_PUNCTUATION = "/?@&=+,-_.!~()%#;:$*";

    /** Maximum distance between '&amp;' and ';' for a sequence to be treated as an entity candidate. */
    private static final int MAX_ENTITY_LENGTH = 12;

    /** Named HTML entities understood by {@link #unescapeHTML(String)}; never modified after class initialisation. */
    private static final Map<String, Character> ENTITIES = buildHtmlEntities( );

    private static final char CYRILLIC_FIRST = '\u0410'; // CYRILLIC CAPITAL LETTER A
    private static final char CYRILLIC_LAST = '\u044F'; // CYRILLIC SMALL LETTER YA
    private static final char CYRILLIC_CAPITAL_IO = '\u0401';
    private static final char CYRILLIC_SMALL_IO = '\u0451';

    /**
     * Latin replacements for the contiguous Cyrillic range U+0410..U+044F, indexed by
     * {@code ch - CYRILLIC_FIRST}. Hard and soft signs map to the empty string.
     */
    private static final String [ ] CYRILLIC_TO_LATIN =
    {
        // U+0410..U+041F
        "A", "B", "V", "G", "D", "E", "ZH", "Z", "I", "Y", "K", "L", "M", "N", "O", "P",
        // U+0420..U+042F (U+042A hard sign and U+042C soft sign are dropped)
        "R", "S", "T", "U", "F", "H", "TS", "CH", "SH", "SCH", "", "Y", "", "E", "YU", "YA",
        // U+0430..U+043F
        "a", "b", "v", "g", "d", "e", "zh", "z", "i", "y", "k", "l", "m", "n", "o", "p",
        // U+0440..U+044F (U+044A hard sign and U+044C soft sign are dropped)
        "r", "s", "t", "u", "f", "h", "ts", "ch", "sh", "sch", "", "y", "", "e", "yu", "ya"
    };

    /**
     * Tells whether a character may be part of a URL.
     *
     * Accepted are ASCII letters and digits plus the RFC 2396 reserved, unreserved and
     * delimiter characters, except the apostrophe.
     *
     * @param c
     *            character to test
     * @return {@code true} if the character is accepted inside a URL
     */
    public static boolean isUrlChar( char c )
    {
        // From MediaWiki: "._\\/~%-+&#?!=()@"
        // From http://www.ietf.org/rfc/rfc2396.txt :
        // reserved: ";/?:@&=+$,"
        // unreserved: "-_.!~*'()"
        // delim: "%#"
        return isLatinLetterOrDigit( c ) || ( URL_PUNCTUATION.indexOf( c ) >= 0 );
    }

    /**
     * Tells whether a character is an ASCII letter (a-z, A-Z) or an ASCII digit (0-9).
     *
     * @param c
     *            character to test
     * @return {@code true} for ASCII letters and digits only
     */
    public static boolean isLatinLetterOrDigit( char c )
    {
        return ( ( c >= 'a' ) && ( c <= 'z' ) ) || ( ( c >= 'A' ) && ( c <= 'Z' ) ) || ( ( c >= '0' ) && ( c <= '9' ) );
    }

    /**
     * Filters text so there are no '\r' chars in it ("\r\n" becomes "\n"; then "\r" becomes "\n").
     * Most importantly makes all blank lines (lines with only spaces) exactly like this: "\n\n".
     * WikiParser relies on that.
     *
     * The text is trimmed first. Leading whitespace of a non-blank line and all characters after the
     * first non-whitespace character of a line are preserved as they are.
     *
     * @param text
     *            raw wiki text, may be {@code null}
     * @return filtered text, the empty string when {@code text} is {@code null}
     */
    public static String preprocessWikiText( String text )
    {
        if ( text == null )
        {
            return "";
        }

        String trimmed = text.trim( );
        int length = trimmed.length( );

        StringBuilder sb = new StringBuilder( length );
        // whitespace seen since the start of the current line; only emitted once the line
        // turns out to contain something else
        StringBuilder pendingSpaces = new StringBuilder( );
        boolean atLineStart = true;

        for ( int p = 0; p < length; p++ )
        {
            char c = trimmed.charAt( p );

            if ( ( c == '\r' ) || ( c == '\n' ) )
            {
                // "\r\n" counts as a single line break
                if ( ( c == '\r' ) && ( ( p + 1 ) < length ) && ( trimmed.charAt( p + 1 ) == '\n' ) )
                {
                    p++;
                }

                sb.append( '\n' );
                pendingSpaces.setLength( 0 ); // discard spaces if there is nothing else on the line
                atLineStart = true;
            }
            else
                if ( atLineStart && ( c <= ' ' ) )
                {
                    pendingSpaces.append( c );
                }
                else
                {
                    if ( atLineStart )
                    {
                        sb.append( pendingSpaces );
                        atLineStart = false;
                    }

                    sb.append( c );
                }
        }

        return sb.toString( );
    }

    /**
     * Escapes the five HTML special characters: &lt; &gt; &amp; ' and ".
     *
     * @param s
     *            text to escape, may be {@code null}
     * @return escaped text, the empty string when {@code s} is {@code null}
     */
    public static String escapeHTML( String s )
    {
        if ( s == null )
        {
            return "";
        }

        int length = s.length( );
        StringBuilder sb = new StringBuilder( length + 100 );

        for ( int i = 0; i < length; i++ )
        {
            char ch = s.charAt( i );

            switch( ch )
            {
                case '<':
                    sb.append( "&lt;" );
                    break;
                case '>':
                    sb.append( "&gt;" );
                    break;
                case '&':
                    sb.append( "&amp;" );
                    break;
                case '\'':
                    sb.append( "&#39;" );
                    break;
                case '"':
                    sb.append( "&quot;" );
                    break;
                default:
                    sb.append( ch );
                    break;
            }
        }

        return sb.toString( );
    }

    /**
     * Builds the table of named entities recognised by {@link #unescapeHTML(String)}.
     *
     * @return unmodifiable map from entity name (without '&amp;' and ';') to its character
     */
    private static Map<String, Character> buildHtmlEntities( )
    {
        Map<String, Character> map = new HashMap<String, Character>( );
        map.put( "lt", '<' );
        map.put( "gt", '>' );
        map.put( "amp", '&' );
        map.put( "quot", '"' );
        map.put( "apos", '\'' );
        map.put( "nbsp", '\u00A0' );
        map.put( "shy", '\u00AD' );
        map.put( "copy", '\u00A9' );
        map.put( "reg", '\u00AE' );
        map.put( "trade", '\u2122' );
        map.put( "mdash", '\u2014' );
        map.put( "ndash", '\u2013' );
        map.put( "ldquo", '\u201C' );
        map.put( "rdquo", '\u201D' );
        map.put( "euro", '\u20AC' );
        map.put( "middot", '\u00B7' );
        map.put( "bull", '\u2022' );
        map.put( "laquo", '\u00AB' );
        map.put( "raquo", '\u00BB' );

        return Collections.unmodifiableMap( map );
    }

    /**
     * Replaces HTML character references by the characters they denote.
     *
     * Supported are decimal ({@code &#65;}) and hexadecimal ({@code &#x41;} or {@code &#X41;})
     * numeric references, including code points above U+FFFF, and a fixed set of named entities.
     * Anything that cannot be decoded (unknown name, malformed or out-of-range number, missing
     * ';' within the length limit) is copied to the output unchanged.
     *
     * @param value
     *            text to unescape, may be {@code null}
     * @return unescaped text, or {@code null} when {@code value} is {@code null}
     */
    public static String unescapeHTML( String value )
    {
        if ( value == null )
        {
            return null;
        }

        if ( value.indexOf( '&' ) < 0 )
        {
            return value;
        }

        final int length = value.length( );
        StringBuilder sb = new StringBuilder( length );

        for ( int i = 0; i < length; i++ )
        {
            char c = value.charAt( i );

            if ( c == '&' )
            {
                int end = value.indexOf( ';', i + 1 );

                if ( ( end > i ) && ( ( end - i ) <= MAX_ENTITY_LENGTH ) )
                {
                    int codePoint = decodeEntity( value.substring( i + 1, end ) );

                    if ( codePoint > 0 )
                    {
                        sb.appendCodePoint( codePoint );
                        i = end; // skip the whole reference including ';'

                        continue;
                    }
                }
            }

            sb.append( c );
        }

        return sb.toString( );
    }

    /**
     * Decodes the text found between '&amp;' and ';' of a character reference.
     *
     * @param body
     *            reference body, e.g. {@code "amp"}, {@code "#65"} or {@code "#x41"}
     * @return the code point denoted by the reference, or 0 if it cannot be decoded
     */
    private static int decodeEntity( String body )
    {
        if ( body.startsWith( "#" ) )
        {
            boolean hex = ( body.length( ) > 1 ) && ( ( body.charAt( 1 ) == 'x' ) || ( body.charAt( 1 ) == 'X' ) );
            int codePoint = hex ? atoi( body.substring( 2 ), 16 ) : atoi( body.substring( 1 ) );

            // rejects negative numbers and values beyond U+10FFFF instead of truncating them
            return Character.isValidCodePoint( codePoint ) ? codePoint : 0;
        }

        Character named = ENTITIES.get( body );

        return ( named == null ) ? 0 : named.charValue( );
    }

    /**
     * Parses a decimal integer, returning 0 instead of failing.
     *
     * @param s
     *            text to parse, may be {@code null}
     * @return parsed value, or 0 if {@code s} is not a valid decimal integer
     */
    static public int atoi( String s )
    {
        try
        {
            return Integer.parseInt( s );
        }
        catch( NumberFormatException ignored )
        {
            // lenient by contract: unparsable input yields 0
            return 0;
        }
    }

    /**
     * Parses an integer in the given radix, returning 0 instead of failing.
     *
     * @param s
     *            text to parse, may be {@code null}
     * @param base
     *            radix to use
     * @return parsed value, or 0 if {@code s} is not a valid integer in that radix
     */
    static public int atoi( String s, int base )
    {
        try
        {
            return Integer.parseInt( s, base );
        }
        catch( NumberFormatException ignored )
        {
            // lenient by contract: unparsable input (or an unsupported radix) yields 0
            return 0;
        }
    }

    /**
     * Replaces every occurrence of a literal substring.
     *
     * @param str
     *            text to search in
     * @param from
     *            literal text to look for
     * @param to
     *            replacement text
     * @return text with all occurrences replaced; {@code str} itself when {@code str} is
     *         {@code null} or {@code from} is {@code null} or empty
     */
    public static String replaceString( String str, String from, String to )
    {
        // An empty pattern matches at every position without advancing, which would loop forever.
        if ( ( str == null ) || isEmpty( from ) )
        {
            return str;
        }

        StringBuilder buf = new StringBuilder( str.length( ) );
        int patternLength = from.length( );
        int start = 0;
        int match;

        while ( ( match = str.indexOf( from, start ) ) >= 0 )
        {
            buf.append( str, start, match );
            buf.append( to );
            start = match + patternLength;
        }

        buf.append( str, start, str.length( ) );

        return buf.toString( );
    }

    /**
     * Splits a string on a single separator character, keeping empty parts.
     *
     * This is meant to be faster than String.split() when separator is not regexp.
     *
     * @param s
     *            text to split, may be {@code null}
     * @param separator
     *            separator character
     * @return the parts in order (at least one element), or {@code null} when {@code s} is {@code null}
     */
    public static String [ ] split( String s, char separator )
    {
        if ( s == null )
        {
            return null;
        }

        List<String> parts = new ArrayList<String>( );
        int beginIndex = 0;
        int endIndex;

        while ( ( endIndex = s.indexOf( separator, beginIndex ) ) >= 0 )
        {
            parts.add( s.substring( beginIndex, endIndex ) );
            beginIndex = endIndex + 1;
        }

        parts.add( s.substring( beginIndex ) );

        return parts.toArray( new String [ parts.size( )] );
    }

    /**
     * Transliterates Cyrillic (Russian) letters into Latin ones for use in URLs etc.
     *
     * Hard and soft signs are dropped; every character outside the Russian alphabet is copied
     * unchanged. The lookup data is written with Unicode escapes so the result does not depend on
     * the encoding the source file is compiled with.
     *
     * @param s
     *            string to be translated, may be {@code null}
     * @return translated string, the empty string when {@code s} is {@code null}
     */
    public static String translit( String s )
    {
        if ( s == null )
        {
            return "";
        }

        final int length = s.length( );
        StringBuilder sb = new StringBuilder( length + 100 );

        for ( int i = 0; i < length; i++ )
        {
            char ch = s.charAt( i );

            if ( ( ch >= CYRILLIC_FIRST ) && ( ch <= CYRILLIC_LAST ) )
            {
                sb.append( CYRILLIC_TO_LATIN [ch - CYRILLIC_FIRST] );
            }
            else
                if ( ch == CYRILLIC_CAPITAL_IO )
                {
                    sb.append( 'E' );
                }
                else
                    if ( ch == CYRILLIC_SMALL_IO )
                    {
                        sb.append( 'e' );
                    }
                    else
                    {
                        sb.append( ch );
                    }
        }

        return sb.toString( );
    }

    /**
     * Converts the empty string to {@code null}.
     *
     * @param s
     *            any string, may be {@code null}
     * @return {@code null} if {@code s} is empty, otherwise {@code s}
     */
    public static String emptyToNull( String s )
    {
        return "".equals( s ) ? null : s;
    }

    /**
     * Converts {@code null} to the empty string.
     *
     * @param s
     *            any string, may be {@code null}
     * @return the empty string if {@code s} is {@code null}, otherwise {@code s}
     */
    public static String noNull( String s )
    {
        return ( s == null ) ? "" : s;
    }

    /**
     * Substitutes a default for {@code null}.
     *
     * @param s
     *            any string, may be {@code null}
     * @param val
     *            value to return when {@code s} is {@code null}
     * @return {@code val} if {@code s} is {@code null}, otherwise {@code s}
     */
    public static String noNull( String s, String val )
    {
        return ( s == null ) ? val : s;
    }

    /**
     * Tells whether a string is {@code null} or has zero length.
     *
     * @param s
     *            any string, may be {@code null}
     * @return {@code true} if {@code s} is {@code null} or empty
     */
    public static boolean isEmpty( String s )
    {
        return ( ( s == null ) || ( s.length( ) == 0 ) );
    }
}