View Javadoc
1   /*
2    * Copyright 2007-2009 Yaroslav Stavnichiy, yarosla@gmail.com
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   * Latest version of this software can be obtained from:
17   *
18   *     http://t4-wiki-parser.googlecode.com
19   *
20   * If you make use of this code, I'd appreciate hearing about it.
21   * Comments, suggestions, and bug reports welcome: yarosla@gmail.com
22   */
23  package ys.wikiparser;
24  
25  import static ys.wikiparser.Utils.*;
26  
27  import java.net.*;
28  
29  import java.util.HashSet;
30  
31  /**
32   * WikiParser.renderXHTML() is the main method of this class. It takes wiki-text and returns XHTML.
33   *
34   * WikiParser's behavior can be customized by overriding appendXxx() methods, which should make integration of this class into any wiki/blog/forum software easy
35   * and painless.
36   *
37   * @author Yaroslav Stavnichiy (yarosla@gmail.com)
38   *
39   */
40  public class WikiParser
41  {
    // Upper bound on list nesting depth; listLevels is sized MAX_LIST_LEVELS + 1.
    private static final int MAX_LIST_LEVELS = 100;
    // NOTE(review): not referenced in the visible part of the class; presumably the
    // sequences that '~' may escape at inline level - confirm against the inline parser.
    private static final String [ ] ESCAPED_INLINE_SEQUENCES = {
            "{{{", "{{", "}}}", "**", "//", "__", "##", "\\\\", "[[", "<<<", "~", "--", "|"
    };
    // Line-start characters that open a list-like block. The index of a char in this
    // string is used to select the matching entry of LIST_OPEN / LIST_CLOSE.
    private static final String LIST_CHARS = "*-#>:!";
    private static final String [ ] LIST_OPEN = {
            "<ul><li>", "<ul><li>", "<ol><li>", "<blockquote>", "<div class='wiki_indent'>", "<div class='wiki_center'>"
    };
    private static final String [ ] LIST_CLOSE = {
            "</li></ul>\n", "</li></ul>\n", "</li></ol>\n", "</blockquote>\n", "</div>\n", "</div>\n"
    };
    // Characters that start inline formatting delimiters; note '*' and '#' are also list chars.
    private static final String FORMAT_CHARS = "*/_#";
    // NOTE(review): the three FORMAT_* arrays look like parallel tables indexed like
    // FORMAT_CHARS; their use is outside the visible part of the file - confirm.
    private static final String [ ] FORMAT_DELIM = {
            "**", "//", "__", "##"
    };
    private static final String [ ] FORMAT_TAG_OPEN = {
            "<strong>", "<em>", "<span class=\"wiki_underline\">", "<tt>"
    };
    private static final String [ ] FORMAT_TAG_CLOSE = {
            "</strong>", "</em>", "</span>", "</tt>"
    };
    // Length of the preprocessed wiki text and its characters (both set in parse()).
    private int wikiLength;
    private char [ ] wikiChars;
    // Output buffer for the generated XHTML; toString() returns its content.
    protected StringBuilder sb = new StringBuilder( );
    // NOTE(review): toc/tocLevel/tocAnchorIds are not written in the visible code;
    // presumably maintained by the TOC helpers (appendTOCItem, completeTOC) - confirm.
    protected StringBuilder toc = new StringBuilder( );
    protected int tocLevel = 0;
    private HashSet<String> tocAnchorIds = new HashSet<String>( );
    // Preprocessed wiki text (same content as wikiChars).
    private String wikiText;
    // Current block-level parsing position within wikiChars.
    private int pos = 0;
    // Index of the innermost open list in listLevels; -1 when no list is open.
    private int listLevel = -1;
    private char [ ] listLevels = new char [ MAX_LIST_LEVELS + 1]; // max number of levels allowed
    // When true, parseListItem() does not emit "<br/><br/>" for an empty blockquote/indent line.
    private boolean blockquoteBR = false;
    // True while a simple ('|'-row) table is open.
    private boolean inTable = false;
    // Number of mediawiki-style ('{|' ... '|}') tables currently open.
    private int mediawikiTableLevel = 0;
    protected int HEADING_LEVEL_SHIFT = 1; // make =h2, ==h3, ...
    // NOTE(review): not used in the visible code; presumably prefixed to generated heading ids - confirm.
    protected String HEADING_ID_PREFIX = null;
    // CSS class written on generated <table> elements.
    private String _strTableClass = "";
    // CSS class written on the <div> wrapping each generated table.
    private String _strParentTableClass = "";
    // NOTE(review): not used in the visible code; presumably the CSS class of the TOC container - confirm.
    private String _strTocClass = "toc";
81  
    /**
     * Creates a parser without parsing anything; no output is produced until
     * {@link #parse(String)} is called.
     */
    protected WikiParser( )
    {
        // for use by subclasses only
        // subclasses should call parse() to complete construction
    }
87  
    /**
     * Creates a parser and immediately parses the given wiki text; the rendered
     * result is then available through {@link #toString()}.
     *
     * @param wikiText
     *            wiki markup to render
     */
    protected WikiParser( String wikiText )
    {
        // NOTE(review): parse() is protected and overridable, and it is invoked here
        // during construction - subclasses overriding it should not rely on their own fields.
        parse( wikiText );
    }
92  
93      protected void setTableClass( String strClass )
94      {
95          _strTableClass = strClass;
96      }
97  
98  
99      protected void setParentTableClass( String strParentClass )
100     {
101         _strParentTableClass = strParentClass;
102     }
103 
104 
105     protected void setTocClass( String strClass )
106     {
107         _strTocClass = strClass;
108     }
109 
110     public static String renderXHTML( String wikiText )
111     {
112         return new WikiParser( wikiText ).toString( );
113     }
114 
115     protected void parse( String wikiText )
116     {
117         wikiText = preprocessWikiText( wikiText );
118 
119         this.wikiText = wikiText;
120         wikiLength = this.wikiText.length( );
121         wikiChars = new char [ wikiLength];
122         this.wikiText.getChars( 0, wikiLength, wikiChars, 0 );
123 
124         while ( parseBlock( ) )
125             ;
126 
127         closeListsAndTables( );
128 
129         while ( mediawikiTableLevel-- > 0 )
130             sb.append( "</td></tr></table></div>\n" );
131 
132         completeTOC( );
133     }
134 
135     public String toString( )
136     {
137         return sb.toString( );
138     }
139 
140     private void closeListsAndTables( )
141     {
142         // close unclosed lists
143         while ( listLevel >= 0 )
144         {
145             sb.append( LIST_CLOSE [LIST_CHARS.indexOf( listLevels [listLevel--] )] );
146         }
147 
148         if ( inTable )
149         {
150             sb.append( "</table></div>\n" );
151             inTable = false;
152         }
153     }
154 
155     private boolean parseBlock( )
156     {
157         for ( ; ( pos < wikiLength ) && ( wikiChars [pos] <= ' ' ) && ( wikiChars [pos] != '\n' ); pos++ )
158             ; // skip whitespace
159 
160         if ( pos >= wikiLength )
161         {
162             return false;
163         }
164 
165         char c = wikiChars [pos];
166 
167         if ( c == '\n' )
168         { // blank line => end of list/table; no other meaning
169             closeListsAndTables( );
170             pos++;
171 
172             return true;
173         }
174 
175         if ( c == '|' )
176         { // table
177 
178             if ( mediawikiTableLevel > 0 )
179             {
180                 int pp = pos + 1;
181 
182                 if ( pp < wikiLength )
183                 {
184                     boolean newRow = false;
185                     boolean endTable = false;
186 
187                     if ( wikiChars [pp] == '-' )
188                     { // mediawiki-table new row
189                         newRow = true;
190                         pp++;
191                     }
192                     else
193                         if ( wikiChars [pp] == '}' )
194                         { // mediawiki-table end table
195                             endTable = true;
196                             pp++;
197                         }
198 
199                     for ( ; ( pp < wikiLength ) && ( ( wikiChars [pp] == ' ' ) || ( wikiChars [pp] == '\t' ) ); pp++ )
200                         ; // skip spaces
201 
202                     if ( ( pp == wikiLength ) || ( wikiChars [pp] == '\n' ) )
203                     { // nothing else on the line => it's mediawiki-table markup
204                         closeListsAndTables( ); // close lists if any
205                         sb.append( newRow ? "</td></tr>\n<tr><td>" : ( endTable ? "</td></tr></table></div>\n" : "</td>\n<td>" ) );
206 
207                         if ( endTable )
208                         {
209                             mediawikiTableLevel--;
210                         }
211 
212                         pos = pp + 1;
213 
214                         return pp < wikiLength;
215                     }
216                 }
217             }
218 
219             if ( !inTable )
220             {
221                 closeListsAndTables( ); // close lists if any
222                 sb.append( "<div class=\"").append( _strParentTableClass ).append( "\" >" ).append("<table class=\"" ).append( _strTableClass ).append( "\" >" );
223                 inTable = true;
224             }
225 
226             pos = parseTableRow( pos + 1 );
227 
228             return true;
229         }
230         else
231         {
232             if ( inTable )
233             {
234                 sb.append( "</table></div>\n" );
235                 inTable = false;
236             }
237         }
238 
239         if ( ( listLevel >= 0 ) || ( LIST_CHARS.indexOf( c ) >= 0 ) )
240         { // lists
241 
242             int lc;
243 
244             // count list level
245             for ( lc = 0; ( lc <= listLevel ) && ( ( pos + lc ) < wikiLength ) && ( wikiChars [pos + lc] == listLevels [lc] ); lc++ )
246                 ;
247 
248             if ( lc <= listLevel )
249             { // end list block(s)
250 
251                 do
252                 {
253                     sb.append( LIST_CLOSE [LIST_CHARS.indexOf( listLevels [listLevel--] )] );
254                 }
255                 while ( lc <= listLevel );
256 
257                 // list(s) closed => retry from the same position
258                 blockquoteBR = true;
259 
260                 return true;
261             }
262             else
263             {
264                 if ( ( pos + lc ) >= wikiLength )
265                 {
266                     return false;
267                 }
268 
269                 char cc = wikiChars [pos + lc];
270                 int listType = LIST_CHARS.indexOf( cc );
271 
272                 if ( ( listType >= 0 ) && ( ( pos + lc + 1 ) < wikiLength ) && ( wikiChars [pos + lc + 1] != cc ) && ( listLevel < MAX_LIST_LEVELS ) )
273                 { // new list block
274                     sb.append( LIST_OPEN [listType] );
275                     listLevels [++listLevel] = cc;
276                     blockquoteBR = true;
277                     pos = parseListItem( pos + lc + 1 );
278 
279                     return true;
280                 }
281                 else
282                     if ( listLevel >= 0 )
283                     { // list item - same level
284 
285                         if ( ( listLevels [listLevel] == '>' ) || ( listLevels [listLevel] == ':' ) )
286                         {
287                             sb.append( '\n' );
288                         }
289                         else
290                             if ( listLevels [listLevel] == '!' )
291                             {
292                                 sb.append( "</div>\n<div class='wiki_center'>" );
293                             }
294                             else
295                             {
296                                 sb.append( "</li>\n<li>" );
297                             }
298 
299                         pos = parseListItem( pos + lc );
300 
301                         return true;
302                     }
303             }
304         }
305 
306         if ( c == '=' )
307         { // heading
308 
309             int hc;
310 
311             // count heading level
312             for ( hc = 1; ( hc < 6 ) && ( ( pos + hc ) < wikiLength ) && ( wikiChars [pos + hc] == '=' ); hc++ )
313                 ;
314 
315             if ( ( pos + hc ) >= wikiLength )
316             {
317                 return false;
318             }
319 
320             int p;
321 
322             for ( p = pos + hc; ( p < wikiLength ) && ( ( wikiChars [p] == ' ' ) || ( wikiChars [p] == '\t' ) ); p++ )
323                 ; // skip spaces
324 
325             String tagName = "h" + ( hc + HEADING_LEVEL_SHIFT );
326             sb.append( "<" + tagName + " id=''>" ); // real id to be inserted after parsing this item
327 
328             int hStart = sb.length( );
329             pos = parseItem( p, wikiText.substring( pos, pos + hc ), ContextType.HEADER );
330 
331             String hText = sb.substring( hStart, sb.length( ) );
332             sb.append( "</" + tagName + ">\n" );
333 
334             String anchorId = generateTOCAnchorId( hc, hText );
335             sb.insert( hStart - 2, anchorId );
336             appendTOCItem( hc, anchorId, hText );
337 
338             return true;
339         }
340         else
341             if ( c == '{' )
342             { // nowiki-block?
343 
344                 if ( ( ( pos + 2 ) < wikiLength ) && ( wikiChars [pos + 1] == '{' ) && ( wikiChars [pos + 2] == '{' ) )
345                 {
346                     int startNowiki = pos + 3;
347                     int endNowiki = findEndOfNowiki( startNowiki );
348                     int endPos = endNowiki + 3;
349 
350                     if ( wikiText.lastIndexOf( '\n', endNowiki ) >= startNowiki )
351                     { // block <pre>
352 
353                         if ( wikiChars [startNowiki] == '\n' )
354                         {
355                             startNowiki++; // skip the very first '\n'
356                         }
357 
358                         if ( wikiChars [endNowiki - 1] == '\n' )
359                         {
360                             endNowiki--; // omit the very last '\n'
361                         }
362 
363                         // sb.append( "<pre>" );
364                         appendNowiki( wikiText.substring( startNowiki, endNowiki ) );
365                         // sb.append( "</pre>\n" );
366                         pos = endPos;
367 
368                         return true;
369                     }
370 
371                     // else inline <nowiki> - proceed to regular paragraph handling
372                 }
373                 else
374                     if ( ( ( pos + 1 ) < wikiLength ) && ( wikiChars [pos + 1] == '|' ) )
375                     { // mediawiki-table?
376 
377                         int pp;
378 
379                         for ( pp = pos + 2; ( pp < wikiLength ) && ( ( wikiChars [pp] == ' ' ) || ( wikiChars [pp] == '\t' ) ); pp++ )
380                             ; // skip spaces
381 
382                         if ( ( pp == wikiLength ) || ( wikiChars [pp] == '\n' ) )
383                         { // yes, it's start of a table
384                             sb.append( "<div class=\"").append( _strParentTableClass ).append( "\" >" ).append("<table class=\"" ).append( _strTableClass ).append( "\"><tr><td>" );
385                             mediawikiTableLevel++;
386                             pos = pp + 1;
387 
388                             return pp < wikiLength;
389                         }
390                     }
391             }
392             else
393                 if ( ( c == '-' ) && wikiText.startsWith( "----", pos ) )
394                 {
395                     int p;
396 
397                     for ( p = pos + 4; ( p < wikiLength ) && ( ( wikiChars [p] == ' ' ) || ( wikiChars [p] == '\t' ) ); p++ )
398                         ; // skip spaces
399 
400                     if ( ( p == wikiLength ) || ( wikiChars [p] == '\n' ) )
401                     {
402                         sb.append( "\n<hr/>\n" );
403                         pos = p;
404 
405                         return true;
406                     }
407                 }
408                 else
409                     if ( c == '~' )
410                     { // block-level escaping: '*' '-' '#' '>' ':' '!' '|' '='
411 
412                         if ( ( pos + 1 ) < wikiLength )
413                         {
414                             char nc = wikiChars [pos + 1];
415 
416                             if ( ( nc == '>' ) || ( nc == ':' ) || ( nc == '-' ) || ( nc == '|' ) || ( nc == '=' ) || ( nc == '!' ) )
417                             { // can't be inline markup
418                                 pos++; // skip '~' and proceed to regular paragraph handling
419                                 c = nc;
420                             }
421                             else
422                                 if ( ( nc == '*' ) || ( nc == '#' ) )
423                                 { // might be inline markup so need to double check
424 
425                                     char nnc = ( ( pos + 2 ) < wikiLength ) ? wikiChars [pos + 2] : 0;
426 
427                                     if ( nnc != nc )
428                                     {
429                                         pos++; // skip '~' and proceed to regular paragraph handling
430                                         c = nc;
431                                     }
432 
433                                     // otherwise escaping will be done at line level
434                                 }
435                                 else
436                                     if ( nc == '{' )
437                                     { // might be inline {{{ markup so need to double check
438 
439                                         char nnc = ( ( pos + 2 ) < wikiLength ) ? wikiChars [pos + 2] : 0;
440 
441                                         if ( nnc == '|' )
442                                         { // mediawiki-table?
443                                             pos++; // skip '~' and proceed to regular paragraph handling
444                                             c = nc;
445                                         }
446 
447                                         // otherwise escaping will be done at line level
448                                     }
449                         }
450                     }
451 
452         sb.append( "<p>" );
453         pos = parseItem( pos, null, ContextType.PARAGRAPH );
454         sb.append( "</p>\n" );
455 
456         return true;
457     }
458 
459     /**
460      * Finds first closing '}}}' for nowiki block or span. Skips escaped sequences: '~}}}'.
461      *
462      * @param startBlock
463      *            points to first char after '{{{'
464      * @return position of first '}' in closing '}}}'
465      */
466     private int findEndOfNowiki( int startBlock )
467     {
468         // NOTE: this method could step back one char from startBlock position
469         int endBlock = startBlock - 3;
470 
471         do
472         {
473             endBlock = wikiText.indexOf( "}}}", endBlock + 3 );
474 
475             if ( endBlock < 0 )
476             {
477                 return wikiLength; // no matching '}}}' found
478             }
479 
480             while ( ( ( endBlock + 3 ) < wikiLength ) && ( wikiChars [endBlock + 3] == '}' ) )
481                 endBlock++; // shift to end of sequence of more than 3x'}' (eg. '}}}}}')
482         }
483         while ( wikiChars [endBlock - 1] == '~' );
484 
485         return endBlock;
486     }
487 
488     /**
489      * Greedy version of findEndOfNowiki(). It finds the last possible closing '}}}' before next opening '{{{'. Also uses escapes '~{{{' and '~}}}'.
490      *
491      * @param startBlock
492      *            points to first char after '{{{'
493      * @return position of first '}' in closing '}}}'
494      */
495     @SuppressWarnings( "unused" )
496     private int findEndOfNowikiGreedy( int startBlock )
497     {
498         // NOTE: this method could step back one char from startBlock position
499         int nextBlock = startBlock - 3;
500 
501         do
502         {
503             do
504             {
505                 nextBlock = wikiText.indexOf( "{{{", nextBlock + 3 );
506             }
507             while ( ( nextBlock > 0 ) && ( wikiChars [nextBlock - 1] == '~' ) );
508 
509             if ( nextBlock < 0 )
510             {
511                 nextBlock = wikiLength;
512             }
513 
514             int endBlock = wikiText.lastIndexOf( "}}}", nextBlock );
515 
516             if ( ( endBlock >= startBlock ) && ( wikiChars [endBlock - 1] != '~' ) )
517             {
518                 return endBlock;
519             }
520         }
521         while ( nextBlock < wikiLength );
522 
523         return wikiLength;
524     }
525 
526     /**
527      * @param start
528      *            points to first char after pipe '|'
529      * @return
530      */
531     private int parseTableRow( int start )
532     {
533         if ( start >= wikiLength )
534         {
535             return wikiLength;
536         }
537 
538         sb.append( "<tr>" );
539 
540         boolean endOfRow = false;
541 
542         do
543         {
544             int colspan = 0;
545 
546             while ( ( ( start + colspan ) < wikiLength ) && ( wikiChars [start + colspan] == '|' ) )
547                 colspan++;
548 
549             start += colspan;
550             colspan++;
551 
552             boolean th = ( start < wikiLength ) && ( wikiChars [start] == '=' );
553             start += ( th ? 1 : 0 );
554 
555             while ( ( start < wikiLength ) && ( wikiChars [start] <= ' ' ) && ( wikiChars [start] != '\n' ) )
556                 start++; // trim whitespace from the start
557 
558             if ( ( start >= wikiLength ) || ( wikiChars [start] == '\n' ) )
559             { // skip last empty column
560                 start++; // eat '\n'
561 
562                 break;
563             }
564 
565             sb.append( th ? "<th" : "<td" );
566 
567             if ( colspan > 1 )
568             {
569                 sb.append( " colspan=\"" + colspan + "\"" );
570             }
571 
572             sb.append( '>' );
573 
574             try
575             {
576                 parseItemThrow( start, null, ContextType.TABLE_CELL );
577             }
578             catch( EndOfSubContextException e )
579             { // end of cell
580                 start = e.position;
581 
582                 if ( start >= wikiLength )
583                 {
584                     endOfRow = true;
585                 }
586                 else
587                     if ( wikiChars [start] == '\n' )
588                     {
589                         start++; // eat '\n'
590                         endOfRow = true;
591                     }
592             }
593             catch( EndOfContextException e )
594             {
595                 start = e.position;
596                 endOfRow = true;
597             }
598 
599             sb.append( th ? "</th>" : "</td>" );
600         }
601         while ( !endOfRow /* && start<wikiLength && wikiChars[start]!='\n' */);
602 
603         sb.append( "</tr>\n" );
604 
605         return start;
606     }
607 
608     /**
609      * Same as parseItem(); blank line adds &lt;br/&gt;&lt;br/&gt;
610      *
611      * @param start
612      */
613     private int parseListItem( int start )
614     {
615         while ( ( start < wikiLength ) && ( wikiChars [start] <= ' ' ) && ( wikiChars [start] != '\n' ) )
616             start++; // skip spaces
617 
618         int end = parseItem( start, null, ContextType.LIST_ITEM );
619 
620         if ( ( ( listLevels [listLevel] == '>' ) || ( listLevels [listLevel] == ':' ) ) && ( wikiText.substring( start, end ).trim( ).length( ) == 0 ) )
621         { // empty line within blockquote/div
622 
623             if ( !blockquoteBR )
624             {
625                 sb.append( "<br/><br/>" );
626                 blockquoteBR = true;
627             }
628         }
629         else
630         {
631             blockquoteBR = false;
632         }
633 
634         return end;
635     }
636 
637     /**
638      * @param p
639      *            points to first slash in suspected URI (scheme://etc)
640      * @param start
641      *            points to beginning of parsed item
642      * @param end
643      *            points to end of parsed item
644      *
645      * @return array of two integer offsets [begin_uri, end_uri] if matched, null otherwise
646      */
647     private int [ ] checkURI( int p, int start, int end )
648     {
649         if ( ( p > start ) && ( wikiChars [p - 1] == ':' ) )
650         { // "://" found
651 
652             int pb = p - 1;
653 
654             while ( ( pb > start ) && isLatinLetterOrDigit( wikiChars [pb - 1] ) )
655                 pb--;
656 
657             int pe = p + 2;
658 
659             while ( ( pe < end ) && isUrlChar( wikiChars [pe] ) )
660                 pe++;
661 
662             URI uri = null;
663 
664             do
665             {
666                 while ( ( pe > ( p + 2 ) ) && ( ",.;:?!%)".indexOf( wikiChars [pe - 1] ) >= 0 ) )
667                     pe--; // don't want these chars at the end of URI
668 
669                 try
670                 { // verify URL syntax
671                     uri = new URI( wikiText.substring( pb, pe ) );
672                 }
673                 catch( URISyntaxException e )
674                 {
675                     pe--; // try chopping from the end
676                 }
677             }
678             while ( ( uri == null ) && ( pe > ( p + 2 ) ) );
679 
680             if ( ( uri != null ) && uri.isAbsolute( ) && !uri.isOpaque( ) )
681             {
682                 int [ ] offs = {
683                         pb, pe
684                 };
685 
686                 return offs;
687             }
688         }
689 
690         return null;
691     }
692 
693     private int parseItem( int start, String delimiter, ContextType context )
694     {
695         try
696         {
697             return parseItemThrow( start, delimiter, context );
698         }
699         catch( EndOfContextException e )
700         {
701             return e.position;
702         }
703     }
704 
705     private int parseItemThrow( int start, String delimiter, ContextType context ) throws EndOfContextException
706     {
707         StringBuilder tb = new StringBuilder( );
708 
709         boolean specialCaseDelimiterHandling = "//".equals( delimiter );
710         int p = start;
711         int end = wikiLength;
712 
713         try
714         {
715             nextChar: while ( true )
716             {
717                 if ( p >= end )
718                 {
719                     throw new EndOfContextException( end ); // break;
720                 }
721 
722                 if ( ( delimiter != null ) && wikiText.startsWith( delimiter, p ) )
723                 {
724                     if ( !specialCaseDelimiterHandling || ( checkURI( p, start, end ) == null ) )
725                     {
726                         p += delimiter.length( );
727 
728                         return p;
729                     }
730                 }
731 
732                 char c = wikiChars [p];
733                 boolean atLineStart = false;
734 
735                 // context-defined break test
736                 if ( c == '\n' )
737                 {
738                     if ( ( context == ContextType.HEADER ) || ( context == ContextType.TABLE_CELL ) )
739                     {
740                         p++;
741                         throw new EndOfContextException( p );
742                     }
743 
744                     if ( ( ( p + 1 ) < end ) && ( wikiChars [p + 1] == '\n' ) )
745                     { // blank line delimits everything
746                         p++; // eat one '\n' and leave another one unparsed so parseBlock() can close all lists
747                         throw new EndOfContextException( p );
748                     }
749 
750                     for ( p++; ( p < end ) && ( wikiChars [p] <= ' ' ) && ( wikiChars [p] != '\n' ); p++ )
751                         ; // skip whitespace
752 
753                     if ( p >= end )
754                     {
755                         throw new EndOfContextException( p ); // end of text reached
756                     }
757 
758                     c = wikiChars [p];
759                     atLineStart = true;
760 
761                     if ( ( c == '-' ) && wikiText.startsWith( "----", p ) )
762                     { // check for ---- <hr>
763 
764                         int pp;
765 
766                         for ( pp = p + 4; ( pp < end ) && ( ( wikiChars [pp] == ' ' ) || ( wikiChars [pp] == '\t' ) ); pp++ )
767                             ; // skip spaces
768 
769                         if ( ( pp == end ) || ( wikiChars [pp] == '\n' ) )
770                         {
771                             throw new EndOfContextException( p ); // yes, it's <hr>
772                         }
773                     }
774 
775                     if ( LIST_CHARS.indexOf( c ) >= 0 )
776                     { // start of list item?
777 
778                         if ( FORMAT_CHARS.indexOf( c ) < 0 )
779                         {
780                             throw new EndOfContextException( p );
781                         }
782 
783                         // here we have a list char, which also happen to be a format char
784                         if ( ( ( p + 1 ) < end ) && ( wikiChars [p + 1] != c ) )
785                         {
786                             throw new EndOfContextException( p ); // format chars go in pairs
787                         }
788 
789                         if ( /* context==ContextType.LIST_ITEM */
790                         ( listLevel >= 0 ) && ( c == listLevels [0] ) )
791                         {
792                             // c matches current list's first level, so it must be new list item
793                             throw new EndOfContextException( p );
794                         }
795 
796                         // otherwise it must be just formatting sequence => no break of context
797                     }
798                     else
799                         if ( c == '=' )
800                         { // header
801                             throw new EndOfContextException( p );
802                         }
803                         else
804                             if ( c == '|' )
805                             { // table or mediawiki-table
806                                 throw new EndOfContextException( p );
807                             }
808                             else
809                                 if ( c == '{' )
810                                 { // mediawiki-table?
811 
812                                     if ( ( ( p + 1 ) < end ) && ( wikiChars [p + 1] == '|' ) )
813                                     {
814                                         int pp;
815 
816                                         for ( pp = p + 2; ( pp < end ) && ( ( wikiChars [pp] == ' ' ) || ( wikiChars [pp] == '\t' ) ); pp++ )
817                                             ; // skip spaces
818 
819                                         if ( ( pp == end ) || ( wikiChars [pp] == '\n' ) )
820                                         {
821                                             throw new EndOfContextException( p ); // yes, it's start of a table
822                                         }
823                                     }
824                                 }
825 
826                     // if none matched add '\n' to text buffer
827                     tb.append( '\n' );
828 
829                     // p and c already shifted past the '\n' and whitespace after, so go on
830                 }
831                 else
832                     if ( c == '|' )
833                     {
834                         if ( context == ContextType.TABLE_CELL )
835                         {
836                             p++;
837                             throw new EndOfSubContextException( p );
838                         }
839                     }
840 
841                 int formatType;
842 
843                 if ( c == '{' )
844                 {
845                     if ( ( ( p + 1 ) < end ) && ( wikiChars [p + 1] == '{' ) )
846                     {
847                         if ( ( ( p + 2 ) < end ) && ( wikiChars [p + 2] == '{' ) )
848                         { // inline or block <nowiki>
849                             appendText( tb.toString( ) );
850                             tb.delete( 0, tb.length( ) ); // flush text buffer
851 
852                             int startNowiki = p + 3;
853                             int endNowiki = findEndOfNowiki( startNowiki );
854                             p = endNowiki + 3;
855 
856                             if ( wikiText.lastIndexOf( '\n', endNowiki ) >= startNowiki )
857                             { // block <pre>
858 
859                                 if ( wikiChars [startNowiki] == '\n' )
860                                 {
861                                     startNowiki++; // skip the very first '\n'
862                                 }
863 
864                                 if ( wikiChars [endNowiki - 1] == '\n' )
865                                 {
866                                     endNowiki--; // omit the very last '\n'
867                                 }
868 
869                                 if ( context == ContextType.PARAGRAPH )
870                                 {
871                                     sb.append( "</p>" ); // break the paragraph because XHTML does not allow <pre> children of <p>
872                                 }
873 
874                                 // sb.append( "<pre>" );
875                                 appendNowiki( wikiText.substring( startNowiki, endNowiki ) );
876                                 // sb.append( "</pre>\n" );
877 
878                                 if ( context == ContextType.PARAGRAPH )
879                                 {
880                                     sb.append( "<p>" ); // continue the paragraph
881                                                         // if (context==ContextType.NOWIKI_BLOCK) return p; // in this context return immediately after nowiki
882                                 }
883                             }
884                             else
885                             { // inline <nowiki>
886                                 appendNowiki( wikiText.substring( startNowiki, endNowiki ) );
887                             }
888 
889                             continue;
890                         }
891                         else
892                             if ( ( p + 2 ) < end )
893                             { // {{image}}
894 
895                                 int endImg = wikiText.indexOf( "}}", p + 2 );
896 
897                                 if ( ( endImg >= 0 ) && ( endImg < end ) )
898                                 {
899                                     appendText( tb.toString( ) );
900                                     tb.delete( 0, tb.length( ) ); // flush text buffer
901                                     appendImage( wikiText.substring( p + 2, endImg ) );
902                                     p = endImg + 2;
903 
904                                     continue;
905                                 }
906                             }
907                     }
908                 }
909                 else
910                     if ( c == '[' )
911                     {
912                         if ( ( ( p + 1 ) < end ) && ( wikiChars [p + 1] == '[' ) )
913                         { // [[link]]
914 
915                             int endLink = wikiText.indexOf( "]]", p + 2 );
916 
917                             if ( ( endLink >= 0 ) && ( endLink < end ) )
918                             {
919                                 appendText( tb.toString( ) );
920                                 tb.delete( 0, tb.length( ) ); // flush text buffer
921                                 appendLink( wikiText.substring( p + 2, endLink ) );
922                                 p = endLink + 2;
923 
924                                 continue;
925                             }
926                         }
927                     }
928                     else
929                         if ( c == '\\' )
930                         {
931                             if ( ( ( p + 1 ) < end ) && ( wikiChars [p + 1] == '\\' ) )
932                             { // \\ = <br/>
933                                 appendText( tb.toString( ) );
934                                 tb.delete( 0, tb.length( ) ); // flush text buffer
935                                 sb.append( "<br/>" );
936                                 p += 2;
937 
938                                 continue;
939                             }
940                         }
941                         else
942                             if ( c == '<' )
943                             {
944                                 if ( ( ( p + 1 ) < end ) && ( wikiChars [p + 1] == '<' ) )
945                                 {
946                                     if ( ( ( p + 2 ) < end ) && ( wikiChars [p + 2] == '<' ) )
947                                     { // <<<macro>>>
948 
949                                         int endMacro = wikiText.indexOf( ">>>", p + 3 );
950 
951                                         if ( ( endMacro >= 0 ) && ( endMacro < end ) )
952                                         {
953                                             appendText( tb.toString( ) );
954                                             tb.delete( 0, tb.length( ) ); // flush text buffer
955                                             appendMacro( wikiText.substring( p + 3, endMacro ) );
956                                             p = endMacro + 3;
957 
958                                             continue;
959                                         }
960                                     }
961                                 }
962                             }
963                             else
964                                 if ( ( formatType = FORMAT_CHARS.indexOf( c ) ) >= 0 )
965                                 {
966                                     if ( ( ( p + 1 ) < end ) && ( wikiChars [p + 1] == c ) )
967                                     {
968                                         appendText( tb.toString( ) );
969                                         tb.delete( 0, tb.length( ) ); // flush text buffer
970 
971                                         if ( c == '/' )
972                                         { // special case for "//" - check if it is part of URL (scheme://etc)
973 
974                                             int [ ] uriOffs = checkURI( p, start, end );
975 
976                                             if ( uriOffs != null )
977                                             {
978                                                 int pb = uriOffs [0];
979                                                 int pe = uriOffs [1];
980 
981                                                 if ( ( pb > start ) && ( wikiChars [pb - 1] == '~' ) )
982                                                 {
983                                                     sb.delete( sb.length( ) - ( p - pb + 1 ), sb.length( ) ); // roll back URL + tilde
984                                                     sb.append( escapeHTML( wikiText.substring( pb, pe ) ) );
985                                                 }
986                                                 else
987                                                 {
988                                                     sb.delete( sb.length( ) - ( p - pb ), sb.length( ) ); // roll back URL
989                                                     appendLink( wikiText.substring( pb, pe ) );
990                                                 }
991 
992                                                 p = pe;
993 
994                                                 continue;
995                                             }
996                                         }
997 
998                                         sb.append( FORMAT_TAG_OPEN [formatType] );
999 
1000                                         try
1001                                         {
1002                                             p = parseItemThrow( p + 2, FORMAT_DELIM [formatType], context );
1003                                         }
1004                                         finally
1005                                         {
1006                                             sb.append( FORMAT_TAG_CLOSE [formatType] );
1007                                         }
1008 
1009                                         continue;
1010                                     }
1011                                 }
1012                                 else
1013                                     if ( c == '~' )
1014                                     { // escape
1015                                       // most start line escapes are dealt with in parseBlock()
1016 
1017                                         if ( atLineStart )
1018                                         {
1019                                             // same as block-level escaping: '*' '-' '#' '>' ':' '|' '='
1020                                             if ( ( p + 1 ) < end )
1021                                             {
1022                                                 char nc = wikiChars [p + 1];
1023 
1024                                                 if ( ( nc == '>' ) || ( nc == ':' ) || ( nc == '-' ) || ( nc == '|' ) || ( nc == '=' ) || ( nc == '!' ) )
1025                                                 { // can't be inline markup
1026                                                     tb.append( nc );
1027                                                     p += 2; // skip '~' and nc
1028 
1029                                                     continue nextChar;
1030                                                 }
1031                                                 else
1032                                                     if ( ( nc == '*' ) || ( nc == '#' ) )
1033                                                     { // might be inline markup so need to double check
1034 
1035                                                         char nnc = ( ( p + 2 ) < end ) ? wikiChars [p + 2] : 0;
1036 
1037                                                         if ( nnc != nc )
1038                                                         {
1039                                                             tb.append( nc );
1040                                                             p += 2; // skip '~' and nc
1041 
1042                                                             continue nextChar;
1043                                                         }
1044 
1045                                                         // otherwise escaping will be done at line level
1046                                                     }
1047                                                     else
1048                                                         if ( nc == '{' )
1049                                                         { // might be inline {{{ markup so need to double check
1050 
1051                                                             char nnc = ( ( p + 2 ) < end ) ? wikiChars [p + 2] : 0;
1052 
1053                                                             if ( nnc == '|' )
1054                                                             { // mediawiki-table?
1055                                                                 tb.append( nc );
1056                                                                 tb.append( nnc );
1057                                                                 p += 3; // skip '~', nc and nnc
1058 
1059                                                                 continue nextChar;
1060                                                             }
1061 
1062                                                             // otherwise escaping will be done as usual at line level
1063                                                         }
1064                                             }
1065                                         }
1066 
1067                                         for ( String e : ESCAPED_INLINE_SEQUENCES )
1068                                         {
1069                                             if ( wikiText.startsWith( e, p + 1 ) )
1070                                             {
1071                                                 tb.append( e );
1072                                                 p += ( 1 + e.length( ) );
1073 
1074                                                 continue nextChar;
1075                                             }
1076                                         }
1077                                     }
1078                                     else
1079                                         if ( c == '-' )
1080                                         { // ' -- ' => &ndash;
1081 
1082                                             if ( ( ( p + 2 ) < end ) && ( wikiChars [p + 1] == '-' ) && ( wikiChars [p + 2] == ' ' ) && ( p > start )
1083                                                     && ( wikiChars [p - 1] == ' ' ) )
1084                                             {
1085                                                 // appendText(tb.toString()); tb.delete(0, tb.length()); // flush text buffer
1086                                                 // sb.append("&ndash; ");
1087                                                 tb.append( "&ndash; " ); // &ndash; = "\u2013 "
1088                                                 p += 3;
1089 
1090                                                 continue;
1091                                             }
1092                                         }
1093 
1094                 tb.append( c );
1095                 p++;
1096             }
1097         }
1098         finally
1099         {
1100             appendText( tb.toString( ) );
1101             tb.delete( 0, tb.length( ) ); // flush text buffer
1102         }
1103     }
1104 
1105     protected void appendMacro( String text )
1106     {
1107         if ( "TOC".equals( text ) )
1108         {
1109             sb.append( "!!!TOC!!!" ); // put TOC placeholder for replacing it later with real TOC
1110         }
1111         else
1112         {
1113             sb.append( "&lt;&lt;&lt;Macro:" );
1114             sb.append( escapeHTML( unescapeHTML( text ) ) );
1115             sb.append( "&gt;&gt;&gt;" );
1116         }
1117     }
1118 
1119     protected void appendLink( String text )
1120     {
1121         String [ ] link = split( text, '|' );
1122         URI uri = null;
1123 
1124         try
1125         { // validate URI
1126             uri = new URI( link [0].trim( ) );
1127         }
1128         catch( URISyntaxException e )
1129         {
1130         }
1131 
1132         if ( ( uri != null ) && uri.isAbsolute( ) && !uri.isOpaque( ) )
1133         {
1134             sb.append( "<a href=\"" + escapeHTML( uri.toString( ) ) + "\" rel=\"nofollow\">" );
1135             sb.append( escapeHTML( unescapeHTML( ( ( link.length >= 2 ) && !isEmpty( link [1].trim( ) ) ) ? link [1] : link [0] ) ) );
1136             sb.append( "</a>" );
1137         }
1138         else
1139         {
1140             sb.append( "<a href=\"#\" title=\"Internal link\">" );
1141             sb.append( escapeHTML( unescapeHTML( ( ( link.length >= 2 ) && !isEmpty( link [1].trim( ) ) ) ? link [1] : link [0] ) ) );
1142             sb.append( "</a>" );
1143         }
1144     }
1145 
1146     protected void appendImage( String text )
1147     {
1148         String [ ] link = split( text, '|' );
1149         URI uri = null;
1150 
1151         try
1152         { // validate URI
1153             uri = new URI( link [0].trim( ) );
1154         }
1155         catch( URISyntaxException e )
1156         {
1157         }
1158 
1159         if ( ( uri != null ) && uri.isAbsolute( ) && !uri.isOpaque( ) )
1160         {
1161             String alt = escapeHTML( unescapeHTML( ( ( link.length >= 2 ) && !isEmpty( link [1].trim( ) ) ) ? link [1] : link [0] ) );
1162             sb.append( "<img src=\"" + escapeHTML( uri.toString( ) ) + "\" alt=\"" + alt + "\" title=\"" + alt + "\" />" );
1163         }
1164         else
1165         {
1166             sb.append( "&lt;&lt;&lt;Internal image(?): " );
1167             sb.append( escapeHTML( unescapeHTML( text ) ) );
1168             sb.append( "&gt;&gt;&gt;" );
1169         }
1170     }
1171 
1172     protected void appendText( String text )
1173     {
1174         sb.append( escapeHTML( unescapeHTML( text ) ) );
1175     }
1176 
1177     protected String generateTOCAnchorId( int hLevel, String text )
1178     {
1179         int i = 0;
1180         String id = ( ( HEADING_ID_PREFIX != null ) ? HEADING_ID_PREFIX : ( "H" + hLevel + "_" ) )
1181                 + translit( text.replaceAll( "<.+?>", "" ) ).trim( ).replaceAll( "\\s+", "_" ).replaceAll( "[^a-zA-Z0-9_-]", "" );
1182 
1183         while ( tocAnchorIds.contains( id ) )
1184         { // avoid duplicates
1185             i++;
1186             id = text + "_" + i;
1187         }
1188 
1189         tocAnchorIds.add( id );
1190 
1191         return id;
1192     }
1193 
1194     protected void appendTOCItem( int level, String anchorId, String text )
1195     {
1196         if ( level > tocLevel )
1197         {
1198             while ( level > tocLevel )
1199             {
1200                 toc.append( "<ul><li>" );
1201                 tocLevel++;
1202             }
1203         }
1204         else
1205         {
1206             while ( level < tocLevel )
1207             {
1208                 toc.append( "</li></ul>" );
1209                 tocLevel--;
1210             }
1211 
1212             toc.append( "</li>\n<li>" );
1213         }
1214 
1215         toc.append( "<a href='#page_url#" + anchorId + "'>" + text + "</a>" );
1216     }
1217 
1218     protected void completeTOC( )
1219     {
1220         while ( 0 < tocLevel )
1221         {
1222             toc.append( "</li></ul>" );
1223             tocLevel--;
1224         }
1225 
1226         int idx;
1227         String tocDiv = "<div class=\"" + _strTocClass + "\">" + toc.toString( ) + "</div>";
1228 
1229         while ( ( idx = sb.indexOf( "!!!TOC!!!" ) ) >= 0 )
1230         {
1231             sb.replace( idx, idx + 9, tocDiv );
1232         }
1233     }
1234 
1235     protected void appendNowiki( String text )
1236     {
1237         sb.append( escapeHTML( replaceString( replaceString( text, "~{{{", "{{{" ), "~}}}", "}}}" ) ) );
1238     }
1239 
1240     private static class EndOfContextException extends Exception
1241     {
1242         private static final long serialVersionUID = 1L;
1243         int position;
1244 
1245         public EndOfContextException( int position )
1246         {
1247             super( );
1248             this.position = position;
1249         }
1250     }
1251 
1252     private static class EndOfSubContextException extends EndOfContextException
1253     {
1254         private static final long serialVersionUID = 1L;
1255 
1256         public EndOfSubContextException( int position )
1257         {
1258             super( position );
1259         }
1260     }
1261 
1262     private static enum ContextType
1263     {
1264         PARAGRAPH, LIST_ITEM, TABLE_CELL, HEADER, NOWIKI_BLOCK;
1265     }
1266 }