Markup spec/OCaml

How to lex Wikipedia syntax - a draft.

'''This page has not been significantly edited since 16:39, 30 Aug 2003. Its continuing relevancy to Wikipedia is questionable.'''

Have a look at http://code.pediapress.com/ for a feature complete lexer/parser.

Other wikis have benefited from using a real lexer instead of a series of regular expressions. For example, the Lexer from Wookee engine for UseModWiki (which is unfortunately written in Perl, not PHP) could be even quite usable if html attributes parsing was added. (The Parser, however, is questionable).

Later some proof-of-concept parser could be made. It should probably generate some AST, not (X)HTML directly, so it can be used in many useful bots too. Or have 2 modes - AST for bots and HTML for maximum performance ;-). But let's better not go too deep into performance issues now. What's interesting is whether it's possible to make a parser like that.

BTW, someone should fix backslash handling in current parser. Taw 03:28 3 Aug 2003 (UTC)

This page has some commentary about why this might be tough for Wikis. Is this relevant? I don't know enough to judge for myself. Matthewsim 21:16, 20 Aug 2004 (UTC)
 * My understanding of it is that it brings up exactly the problems I've encountered. Consider  This is my dogs bone  vs.  This is my dog's bone .  Wikitax is not context-free. I would suspect resolving this would require having separate start and end tags.  Anthony DiPierro 14:13, 17 Nov 2004 (UTC)


 * Yes. Replacing ticks with pipes, at the end of a document  dog|||s displays as dogs, while dog|||s|| parses as dog|s.  So the lexing of a token can look arbitrarily far in the future, across other tokens. en:User:ciphergoth 82.70.194.38 17:14, 24 September 2006 (UTC)

Idea

 * New Line
 * End of File
 * whitespace +
 * (\* | # )+ (only at linestart)
 * ={1,6} (at linestart or lineend)
 * [ [ articlenamespecification]]
 * [ [ articlenamespecification|
 * ] ]
 * urlspecification
 * [ urlspecification
 * ]
 * &lt;pre> anything_but_close_pre &lt;/pre>
 * &lt;nowiki> anything_but_close_nowiki &lt;/nowiki>
 * &lt;math> anything_but_close_math &lt;/math>
 * &lt;!-- anything_but_close_html_comment -->
 * ISBN whitespace [0-9X-]+
 * RFC whitespace \d+
 * (one token per valid HTML tag)
 * & entityspecification ;
 * anyothercharacter
 * variables (+ magic to parse "articlename" for variables to emulate current multipass parser)
 * &lt;math> anything_but_close_math &lt;/math>
 * &lt;!-- anything_but_close_html_comment -->
 * ISBN whitespace [0-9X-]+
 * RFC whitespace \d+
 * (one token per valid HTML tag)
 * & entityspecification ;
 * anyothercharacter
 * variables (+ magic to parse "articlename" for variables to emulate current multipass parser)

Regular expressions used:
 * articlenamespecification = ???
 * entityspecification = ???
 * urlspecification = ???
 * whitespace = [\s|\t]

= Code = Mini-lexer has 4 files lexer.mll, util.ml, tokens.ml and main.ml (which is kind of trivial parser). It's incomplete:

lexer.mll
{   open Tokens open Util } let anything = ['a'-'zA'-'Z0'-'9''\\128'-'\\255']+ | ['\\000'-'\\255']

let anything_but_close_math = ( [^'&lt;'] | '&lt;' [^'/'] | "&lt;/" [^'m'] | "&lt;/m" [^'a'] | "&lt;/ma" [^'t'] | "&lt;/mat" [^'h'] | "&lt;/math" [^'>'] ) +

let anything_but_close_pre = ( [^'&lt;'] | '&lt;' [^'/'] | "&lt;/" [^'p'] | "&lt;/p" [^'r'] | "&lt;/pr" [^'e'] | "&lt;/pre" [^'>'] ) +

let anything_but_close_nowiki = ( [^'&lt;'] | '&lt;' [^'/'] | "&lt;/" [^'n'] | "&lt;/n" [^'o'] | "&lt;/no" [^'w'] | "&lt;/now" [^'i'] | "&lt;/nowi" [^'k'] | "&lt;/nowik" [^'i'] | "&lt;/nowiki" [^'>'] ) +

let anything_but_close_comment = ( [^'-'] | '-' [^'-'] | "--" [^'>'] ) +

let whitespace = [' ''\\t'] let digit = ['0'-'9'] let hexdigit = ['0'-'9a'-'fA'-'F'] let alphanum = ['a'-'zA'-'Z0'-'9'] let alpha = ['a'-'z''A'-'Z']

let entity_named = "&amp;" alphanum + ";" let entity_dec = "&amp;#" digit + ";" let entity_hex = "&amp;#x" hexdigit + ";"

let html_space = [' \\t\\r''\\n'] let html_space_opt = html_space *

let html_attr_unquoted = ['a'-'z' 'A'-'Z' '0'-'9' '_' ',' ':' '-'] + let html_attr_arg = ('\\ [^'\\] * '\\'' | '"' [^'"'] * '"' | html_attr_unquoted) let html_attr_name = alpha + let html_attr  = html_attr_name html_space * "=" html_space * html_attr_arg let html_attrs = (html_space + html_attr) *

let html_tag_name = alpha alphanum *

let html_opening_tag = "&lt;" html_tag_name html_attrs html_space_opt ">" let html_closing_tag = "&lt;/" html_tag_name html_space_opt ">" let html_closed_tag = "&lt;"  html_tag_name html_space_opt "/>"

let articlename = [' -a'-'zA'-'Z0'-'9:_+,. {}''\\128'-'\\255']+

rule token = parse '\\n' { NL } | '\\r' { token lexbuf } | whitespace + { SP (Lexing.lexeme lexbuf) } | ['#''*'] +     { LIST (Lexing.lexeme lexbuf) } | "=" +     { EQ (String.length (Lexing.lexeme lexbuf)) } | '\\ '\\ +     { Q (String.length (Lexing.lexeme lexbuf)) } | "" '-' *     { HR } | "&lt;pre>" anything_but_close_pre "&lt;/pre>" { PRE (string_brange (Lexing.lexeme lexbuf) 5 6) } | "&lt;nowiki>" anything_but_close_nowiki "&lt;/nowiki>" { NOWIKI (string_brange (Lexing.lexeme lexbuf) 8 9) } | "&lt;math>" anything_but_close_math "&lt;/math>" { MATH (string_brange (Lexing.lexeme lexbuf) 6 7) } | "&lt;!--" anything_but_close_comment "-->" { token lexbuf } | ""     { LINK (string_brange (Lexing.lexeme lexbuf) 2 1) }  | "[[de:" articlename ""      { LINK_INTERWIKI ("de", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("en", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("eo", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("fr", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("pl", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_DEFAULT (string_brange (Lexing.lexeme lexbuf) 2 2) }  | "]]" { LINK_CLOSE } | "August" { LEAF VAR_CURRENTMONTH } | "27"     { LEAF VAR_CURRENTDAY } | "2024"     { LEAF VAR_CURRENTYEAR } | "Tuesday" { LEAF VAR_CURRENTDAYNAME } | ""     { LEAF VAR_CURRENTTIME } | ""     { LEAF VAR_NUMBEROFARTICLES } | ":"     { COLON } | ";"     { SEMI } | " "      { LEAF T3 } | " ~ "     { LEAF T4 } | "RFC" " " ? digit + { LEAF (RFC) } | "ISBN" " " ? ['0'-'9X-'] + { LEAF (ISBN) } | entity_dec { LEAF (ENT_DEC (Lexing.lexeme lexbuf)) } | entity_hex { LEAF (ENT_HEX (Lexing.lexeme lexbuf)) } | entity_named { LEAF (ENT_NAMED (Lexing.lexeme lexbuf)) } | html_opening_tag { parse_html_opening_tag (Lexing.lexeme lexbuf) } | html_closing_tag { parse_html_closing_tag (Lexing.lexeme lexbuf) } | html_closed_tag { parse_html_closed_tag (Lexing.lexeme lexbuf) } | anything { LEAF (LIT (Lexing.lexeme lexbuf)) } | eof { EOF } (*   urls and [urls]    HTML and entities - of course there should be parsing and validation here    articlename - needs to parse variables inside, needs to check what        is allowed and what is not    some unicode magic ?    lexeme_length    complete literal match accelerator    interwiki magic *)

tokens.ml
type t_leaf = T3 | T4 | LIT of string | ENT_DEC of string | RFC | ISBN | ENT_HEX of string | ENT_NAMED of string | VAR_CURRENTMONTH | VAR_CURRENTDAY | VAR_CURRENTYEAR | VAR_CURRENTDAYNAME | VAR_CURRENTTIME | VAR_NUMBEROFARTICLES type t = NL | SP of string | LIST of string | EQ of int | Q of int | HR | PRE of string | NOWIKI of string | MATH of string | LINK of string | LINK_INTERWIKI of string * string | LINK_DEFAULT of string | LINK_CLOSE | COLON | SEMI | LEAF of t_leaf | O_P | O_H1 | O_H2 | O_H3 | O_H4 | O_H5 | O_H6 | C_P | C_H1 | C_H2 | C_H3 | C_H4 | C_H5 | C_H6 | O_UL | O_OL | O_LI | O_TABLE | O_TR | O_TH | O_TD | C_UL | C_OL | C_LI | C_TABLE | C_TR | C_TH | C_TD | O_B | O_I | O_EM | O_STRONG | C_B | C_I | C_EM | C_STRONG | O_U | O_BIG | O_SMALL | O_SUB | O_SUP | C_U | C_BIG | C_SMALL | C_SUB | C_SUP | O_CITE | O_CODE | O_S | O_STRIKE | O_TT | O_VAR | C_CITE | C_CODE | C_S | C_STRIKE | C_TT | C_VAR | O_DIV | O_CENTER | O_BLOCKQUOTE | O_CAPTION | C_DIV | C_CENTER | C_BLOCKQUOTE | C_CAPTION | O_RUBY | O_RT | O_RB | O_RP | O_DT | O_DD | C_RUBY | C_RT | C_RB | C_RP | C_DT | C_DD | Z_BR | Z_HR | Z_TR | Z_TH | Z_TD | EOF

let char_code_0 = 48 let char_code_9 = 57 let char_code_a = 97 let char_code_z = 122 let char_code_A = 65 let char_code_Z = 90

let find_eotn str n0 = let rec find_eotn_aux n = try let c = Char.code (String.get str n)           in if (c >= char_code_0 &amp;&amp; c &lt;= char_code_9) || (c >= char_code_a &amp;&amp; c &lt;= char_code_z) || (c >= char_code_A &amp;&amp; c &lt;= char_code_Z) then find_eotn_aux (n+1) else n       with _ -> n    in find_eotn_aux n0

let parse_html_opening_tag str = let l   = String.length str in let eotn = find_eotn str 1 in let tn  = String.lowercase (String.sub str 1 (eotn-1)) in match tn with "p"         -> O_P | "h1"        -> O_H1 | "h2"        -> O_H2 | "h3"        -> O_H3 | "h4"        -> O_H4 | "h5"        -> O_H5 | "h6"        -> O_H6 | "ul"        -> O_UL | "ol"        -> O_OL | "li"        -> O_LI | "table"     -> O_TABLE | "tr"        -> O_TR | "th"        -> O_TH | "td"        -> O_TD | "b"         -> O_B | "i"         -> O_I | "em"        -> O_EM | "strong"    -> O_STRONG | "u"         -> O_U | "big"       -> O_BIG | "small"     -> O_SMALL | "sub"       -> O_SUB | "sup"       -> O_SUP | "cite"      -> O_CITE | "code"      -> O_CODE | "s"         -> O_S | "strike"    -> O_STRIKE | "tt"        -> O_TT | "var"       -> O_VAR | "div"       -> O_DIV | "center"    -> O_CENTER | "blockquote" -> O_BLOCKQUOTE | "caption"   -> O_CAPTION | "ruby"      -> O_RUBY | "rt"        -> O_RT | "rb"        -> O_RB | "rp"        -> O_RP | "dt"        -> O_DT | "dd"        -> O_DD | _           -> LEAF (LIT str) let parse_html_closing_tag str = let l   = String.length str in let eotn = find_eotn str 2 in let tn  = String.lowercase (String.sub str 2 (eotn-2)) in match tn with "p"         -> C_P | "h1"        -> C_H1 | "h2"        -> C_H2 | "h3"        -> C_H3 | "h4"        -> C_H4 | "h5"        -> C_H5 | "h6"        -> C_H6 | "ul"        -> C_UL | "ol"        -> C_OL | "li"        -> C_LI | "table"     -> C_TABLE | "tr"        -> C_TR | "th"        -> C_TH | "td"        -> C_TD | "b"         -> C_B | "i"         -> C_I | "em"        -> C_EM | "strong"    -> C_STRONG | "u"         -> C_U | "big"       -> C_BIG | "small"     -> C_SMALL | "sub"       -> C_SUB | "sup"       -> C_SUP | "cite"      -> C_CITE | "code"      -> C_CODE | "s"         -> C_S | "strike"    -> C_STRIKE | "tt"        -> C_TT | "var"       -> C_VAR | "div"       -> C_DIV | "center"    -> C_CENTER | "blockquote" -> C_BLOCKQUOTE | "caption"   -> C_CAPTION | "ruby"      -> C_RUBY | "rt"        -> C_RT | "rb"        -> C_RB | "rp"        -> C_RP | "dt"        -> C_DT | "dd"        -> C_DD | _           -> LEAF (LIT str) let parse_html_closed_tag str = let l   = String.length str in let eotn = find_eotn str 1 in let tn  = String.lowercase (String.sub str 1 (eotn-1)) in match tn with "br"       -> Z_BR (* validate that attrs is empty *) | "hr"       -> Z_HR (* validate that attrs is empty *) | "tr"       -> Z_TR (* validate attrs *) | "th"       -> Z_TH (* validate attrs *) | "td"       -> Z_TD (* validate attrs *) | _          -> LEAF (LIT str)

util.ml
let string_brange str s e = let n = String.length str in String.sub str s (n - s - e)

main.ml
open Tokens open Printf

let string_of_token = function LEAF (LIT c)	 -> "lit " ^ c | LEAF VAR_CURRENTMONTH     -> "August" | LEAF VAR_CURRENTDAY      -> "27" | LEAF VAR_CURRENTYEAR     -> "2024" | LEAF VAR_CURRENTDAYNAME  -> "" | LEAF VAR_CURRENTTIME     -> "" | LEAF VAR_NUMBEROFARTICLES -> "" | LEAF (ENT_DEC s)	 -> "&amp;dec; " ^ s | LEAF (ENT_HEX s)	 -> "&amp;hex;" ^ s  | LEAF (ENT_NAMED s)	 -> "&amp;named;" ^ s  | NL       		 -> "\\n" | SP _    		 -> "sp" | EOF     		 -> "eof" | LIST s  		 -> "list " ^ s  | EQ i     		 -> "eq " ^ (string_of_int i)  | Q i      		 -> "q " ^ (string_of_int i)  | HR       		 -> "" | PRE _   		 -> "&lt;pre>?&lt;/pre>" | NOWIKI _		 -> "&lt;nowiki>?&lt;/nowiki>" | MATH _  		 -> "&lt;math>?&lt;/math>" | LINK_INTERWIKI (w,s) -> "" ^ w ^ ":" ^ s ^ "" | LINK s		 -> "" | LINK_DEFAULT s	 -> "[[" ^ s ^ ""  | LINK_CLOSE           -> "]]" | COLON		 -> ":" | SEMI		 -> ";" | LEAF T3		 -> " " | LEAF T4		 -> " ~ " | Z_BR		 -> "&lt;br/>" | Z_HR		 -> "&lt;hr/>" | Z_TR		 -> "&lt;tr/>" | Z_TD		 -> "&lt;td/>" | Z_TH		 -> "&lt;th/>" | O_B     		 -> "&lt;b>" | O_I     		 -> "&lt;i>" | O_P     		 -> "&lt;p>" | O_U     		 -> "&lt;u>" | O_S     		 -> "&lt;s>" | O_H1		 -> "&lt;h1>" | O_H2		 -> "&lt;h2>" | O_H3		 -> "&lt;h3>" | O_H4		 -> "&lt;h4>" | O_H5		 -> "&lt;h5>" | O_H6		 -> "&lt;h6>" | C_B     		 -> "&lt;/b>" | C_I     		 -> "&lt;/i>" | C_P     		 -> "&lt;/p>" | C_U     		 -> "&lt;/u>" | C_S     		 -> "&lt;/s>" | C_H1		 -> "&lt;/h1>" | C_H2		 -> "&lt;/h2>" | C_H3		 -> "&lt;/h3>" | C_H4		 -> "&lt;/h4>" | C_H5		 -> "&lt;/h5>" | C_H6		 -> "&lt;/h6>" | LEAF (RFC)		 -> "rfc" | LEAF (ISBN)		 -> "isbn" | O_UL		 -> "&lt;ul>" | O_OL		 -> "&lt;ol>" | O_LI		 -> "&lt;li>" | O_TR		 -> "&lt;tr>" | O_TH		 -> "&lt;th>" | O_TD		 -> "&lt;td>" | O_EM		 -> "&lt;em>" | O_TT		 -> "&lt;tt>" | O_RT		 -> "&lt;rt>" | O_RB		 -> "&lt;rb>" | O_RP		 -> "&lt;rp>" | O_DD		 -> "&lt;dd>" | O_DT		 -> "&lt;dl>" | C_UL		 -> "&lt;/ul>" | C_OL		 -> "&lt;/ol>" | C_LI		 -> "&lt;/li>" | C_TR		 -> "&lt;/tr>" | C_TH		 -> "&lt;/th>" | C_TD		 -> "&lt;/td>" | C_EM		 -> "&lt;/em>" | C_TT		 -> "&lt;/tt>" | C_RT		 -> "&lt;/rt>" | C_RB		 -> "&lt;/rb>" | C_RP		 -> "&lt;/rp>" | C_DD		 -> "&lt;/dd>" | C_DT		 -> "&lt;/dl>" | O_BIG		 -> "&lt;big>" | O_SUP		 -> "&lt;sup>" | O_SUB		 -> "&lt;sub>" | O_VAR		 -> "&lt;var>" | O_DIV		 -> "&lt;div>" | C_BIG		 -> "&lt;/big>" | C_SUP		 -> "&lt;/sup>" | C_SUB		 -> "&lt;/sub>" | C_VAR		 -> "&lt;/var>" | C_DIV		 -> "&lt;/div>" | O_CODE		 -> "&lt;code>" | O_CITE		 -> "&lt;cite>" | O_RUBY		 -> "&lt;ruby>" | C_CODE		 -> "&lt;/code>" | C_CITE		 -> "&lt;/cite>" | C_RUBY		 -> "&lt;/ruby>" | O_SMALL		 -> "&lt;small>" | O_STRIKE		 -> "&lt;strike>" | O_STRONG		 -> "&lt;strong>" | O_CENTER		 -> "&lt;center>" | O_CAPTION		 -> "&lt;caption>" | O_BLOCKQUOTE	 -> "&lt;blockquote>" | C_SMALL		 -> "&lt;/small>" | C_STRIKE		 -> "&lt;/strike>" | C_STRONG		 -> "&lt;/strong>" | C_CENTER		 -> "&lt;/center>" | C_CAPTION		 -> "&lt;/caption>" | C_BLOCKQUOTE	 -> "&lt;/blockquote>" | O_TABLE		 -> "&lt;table>" | C_TABLE		 -> "&lt;/table>"

let lexbuf = Lexing.from_channel stdin

(* let rec print_lexemes =    let t = Lexer.token lexbuf in	print_string (string_of_token t ^ "\\n");	if t &lt;> EOF	    then print_lexemes

let _ = print_lexemes let get_token = Lexer.token lexbuf

let output_header_interwiki (i,a) = printf "Header interwiki: %s:%s\\n" i a let output_leaf = function LIT l      	   -> printf "Lit: %s\\n" l    | T3 	  	   -> printf "  \\n" | T4 	 	   -> printf " ~ \\n" | ENT_DEC e  	   -> printf "Ent dec: %s\\n" e    | RFC         	   -> printf "RFC\\n" | ISBN       	   -> printf "ISBN\\n" | ENT_HEX e  	   -> printf "Ent hex: %s\\n" e    | ENT_NAMED e 	   -> printf "Ent nam: %s\\n" e    | VAR_CURRENTMONTH     -> printf "August" | VAR_CURRENTDAY      -> printf "27" | VAR_CURRENTYEAR     -> printf "2024" | VAR_CURRENTDAYNAME  -> printf "" | VAR_CURRENTTIME     -> printf "" | VAR_NUMBEROFARTICLES -> printf "" let output_convert = function SP s       -> printf "sp %s\\n" s    | COLON       -> printf ":\\n" | SEMI	 -> printf ";\\n" | EQ i	 -> printf "= * %d\\n" i    | _           -> failwith "wrong convert" type line_type =  LineNormal | LinePre of string | LineEQTry of int | LineList of string

let rec parse_header sp t = match sp,t with Some s,SP s' 	        -> parse_header (Some (s^s')) (get_token ) | None,SP s             -> parse_header (Some s) (get_token ) | _,NL 		        -> parse_header None (get_token ) | _,LINK_INTERWIKI (i,a) -> output_header_interwiki (i,a); parse_header None (get_token ) | None,_                -> parse_line_start t	| Some s,_               -> parse_line_cnt (LinePre s) t and parse_line_start t = match t with SP s  -> parse_line_cnt (LinePre s) (get_token ) | LIST s -> parse_line_cnt (LineList s) (get_token ) | EQ s  -> parse_line_cnt (LineEQTry s) (get_token ) | NL    -> parse_line_start (get_token ) | _     -> parse_line_cnt LineNormal t and parse_line_cnt ltyp t = match t with LEAF l -> output_leaf l; parse_line_cnt ltyp (get_token ) | LIST _ -> output_convert t; parse_line_cnt ltyp (get_token ) | COLON -> output_convert t; parse_line_cnt ltyp (get_token ) | SEMI  -> output_convert t; parse_line_cnt ltyp (get_token ) | SP _  -> ((match ltyp with LinePre _ -> output_convert t		   | _         -> output_leaf (LIT " ") ); parse_line_cnt ltyp (get_token )) | EQ i  -> ((match ltyp with LineEQTry j when i = j -> parse_line_cnt_tryeqf i (get_token ) | _			     -> output_convert t; parse_line_cnt ltyp (get_token ) ); parse_line_cnt ltyp (get_token )) and parse_line_cnt_tryeqf i t = match t with SP _ -> parse_line_cnt_tryeqf i (get_token ) | NL  -> (* a real header line !!! *) parse_line_start (get_token ) | _   -> output_convert (EQ i); parse_line_cnt (LineEQTry i) t let _ = parse_header None (get_token )