Markup spec/OCaml

How to lex Wikipedia syntax - a draft.

What Lexer is about - Could someone please answer this? What is the Lexer for?


 * I keep telling you guys to look at http://wiki.beyondunreal.com/wiki/Wookee ! We could use it or rewrite it in PHP. -- Tarquin 16:39, 30 Aug 2003 (UTC)

Lexer could be even quite usable if html attributes parsing was added. Parser is of course a joke and probably doesn't even compile. Later some proof-of-concept parser could be made. It should probably generate some AST, not (X)HTML directly, so it can be used in many useful bots too. Or have 2 modes - AST for bots and HTML for maximum performance ;-). But let's better not go too deep into performance issues now. What's interesting is whether it's possible to make a parser like that.

BTW, someone should fix backslash handling in current parser. Taw 03:28 3 Aug 2003 (UTC)

= Idea =


 * New Line
 * End of File
 * whitespace +
 * (\* | # )+ (only at linestart)
 * ={1,6} (at linestart or lineend)
 * [ [ articlenamespecification]]
 * [ [ articlenamespecification|
 * ] ]
 * urlspecification
 * [ urlspecification
 * ]
 * &lt;pre> anything_but_close_pre &lt;/pre>
 * &lt;nowiki> anything_but_close_nowiki &lt;/nowiki>
 * &lt;math> anything_but_close_math &lt;/math>
 * &lt;!-- anything_but_close_html_comment -->
 * ISBN whitespace [0-9X-]+
 * RFC whitespace \d+
 * (one token per valid HTML tag)
 * & entityspecification ;
 * anyothercharacter
 * variables (+ magic to parse "articlename" for variables to emulate current multipass parser)
 * &lt;math> anything_but_close_math &lt;/math>
 * &lt;!-- anything_but_close_html_comment -->
 * ISBN whitespace [0-9X-]+
 * RFC whitespace \d+
 * (one token per valid HTML tag)
 * & entityspecification ;
 * anyothercharacter
 * variables (+ magic to parse "articlename" for variables to emulate current multipass parser)

Regular expressions used:
 * articlenamespecification = ???
 * entityspecification = ???
 * urlspecification = ???
 * whitespace = [ \t] (note: a "|" inside a character class is matched literally, so [\s|\t] would also match "|")

= Code = Mini-lexer has 4 files: lexer.mll, util.ml, tokens.ml and main.ml (which is a kind of trivial parser). It's incomplete:

lexer.mll
(* lexer.mll — ocamllex prelude and the first group of named regexes.
   NOTE(review): this listing was mangled by the wiki renderer: "&lt;"
   stands for "<", "&amp;" for "&", and "\\" for a single backslash.
   Decode those before feeding the file to ocamllex. *)
{   open Tokens open Util } let anything = ['a'-'zA'-'Z0'-'9''\\128'-'\\255']+ | ['\\000'-'\\255']

(* Longest run of characters that never contains the "</math>" closer
   (spelled out letter by letter because ocamllex has no complement of
   a multi-character string). *)
let anything_but_close_math = ( [^'&lt;'] | '&lt;' [^'/'] | "&lt;/" [^'m'] | "&lt;/m" [^'a'] | "&lt;/ma" [^'t'] | "&lt;/mat" [^'h'] | "&lt;/math" [^'>'] ) +

(* Same construction for "</pre>". *)
let anything_but_close_pre = ( [^'&lt;'] | '&lt;' [^'/'] | "&lt;/" [^'p'] | "&lt;/p" [^'r'] | "&lt;/pr" [^'e'] | "&lt;/pre" [^'>'] ) +

(* Same construction for "</nowiki>". *)
let anything_but_close_nowiki = ( [^'&lt;'] | '&lt;' [^'/'] | "&lt;/" [^'n'] | "&lt;/n" [^'o'] | "&lt;/no" [^'w'] | "&lt;/now" [^'i'] | "&lt;/nowi" [^'k'] | "&lt;/nowik" [^'i'] | "&lt;/nowiki" [^'>'] ) +

(* Same construction for the HTML comment closer "-->". *)
let anything_but_close_comment = ( [^'-'] | '-' [^'-'] | "--" [^'>'] ) +

(* Basic character classes. *)
let whitespace = [' ''\\t'] let digit = ['0'-'9'] let hexdigit = ['0'-'9a'-'fA'-'F'] let alphanum = ['a'-'zA'-'Z0'-'9'] let alpha = ['a'-'z''A'-'Z']

(* Character entities: named (&amp;nbsp;), decimal (&amp;#160;),
   hexadecimal (&amp;#xA0;). *)
let entity_named = "&amp;" alphanum + ";" let entity_dec = "&amp;#" digit + ";" let entity_hex = "&amp;#x" hexdigit + ";"

(* Whitespace as allowed inside HTML tags. *)
let html_space = [' \\t\\r''\\n'] let html_space_opt = html_space *

(* HTML attributes: name = value with single-quoted, double-quoted or
   unquoted values.
   NOTE(review): the quoting in [html_attr_arg] was destroyed by the
   wiki paste — the single-quote alternative is no longer well-formed
   and must be reconstructed before this will compile. *)
let html_attr_unquoted = ['a'-'z' 'A'-'Z' '0'-'9' '_' ',' ':' '-'] + let html_attr_arg = ('\\ [^'\\] * '\\'' | '"' [^'"'] * '"' | html_attr_unquoted) let html_attr_name = alpha + let html_attr  = html_attr_name html_space * "=" html_space * html_attr_arg let html_attrs = (html_space + html_attr) *

(* A tag name: a letter followed by alphanumerics. *)
let html_tag_name = alpha alphanum *

(* The three shapes of tag lexeme passed to Tokens.parse_html_*_tag. *)
let html_opening_tag = "&lt;" html_tag_name html_attrs html_space_opt ">" let html_closing_tag = "&lt;/" html_tag_name html_space_opt ">" let html_closed_tag = "&lt;"  html_tag_name html_space_opt "/>"

(* Characters permitted in a wiki article name (plus high-bit bytes). *)
let articlename = [' -a'-'zA'-'Z0'-'9:_+,. {}''\\128'-'\\255']+

(* Main lexer entry point: maps raw wikitext to Tokens.t values.
   NOTE(review): many string literals in this rule were eaten by the
   wiki renderer and now read as empty strings "" — the "[[xx:" interwiki
   openers, the "----" horizontal rule, the "''" quote markers and the
   {{variable}} names must all be reconstructed from the idea list in
   this document before the rule can work.  Delimiter widths passed to
   string_brange (e.g. 5 6 for "<pre>"/"</pre>") assume the un-escaped
   tags. *)
rule token = parse '\\n' { NL } | '\\r' { token lexbuf } | whitespace + { SP (Lexing.lexeme lexbuf) } | ['#''*'] +     { LIST (Lexing.lexeme lexbuf) } | "=" +     { EQ (String.length (Lexing.lexeme lexbuf)) } | '\\ '\\ +     { Q (String.length (Lexing.lexeme lexbuf)) } | "" '-' *     { HR } | "&lt;pre>" anything_but_close_pre "&lt;/pre>" { PRE (string_brange (Lexing.lexeme lexbuf) 5 6) } | "&lt;nowiki>" anything_but_close_nowiki "&lt;/nowiki>" { NOWIKI (string_brange (Lexing.lexeme lexbuf) 8 9) } | "&lt;math>" anything_but_close_math "&lt;/math>" { MATH (string_brange (Lexing.lexeme lexbuf) 6 7) } | "&lt;!--" anything_but_close_comment "-->" { token lexbuf } | ""     { LINK (string_brange (Lexing.lexeme lexbuf) 2 1) }  | "[[de:" articlename ""      { LINK_INTERWIKI ("de", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("en", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("eo", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("fr", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("pl", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_DEFAULT (string_brange (Lexing.lexeme lexbuf) 2 2) }  | "]]" { LINK_CLOSE } | "August" { LEAF VAR_CURRENTMONTH } | "27"     { LEAF VAR_CURRENTDAY } | "2024"     { LEAF VAR_CURRENTYEAR } | "Tuesday" { LEAF VAR_CURRENTDAYNAME } | ""     { LEAF VAR_CURRENTTIME } | ""     { LEAF VAR_NUMBEROFARTICLES } | ":"     { COLON } | ";"     { SEMI } | " "      { LEAF T3 } | " ~ "     { LEAF T4 } | "RFC" " " ? digit + { LEAF (RFC) } | "ISBN" " " ? 
['0'-'9X-'] + { LEAF (ISBN) } | entity_dec { LEAF (ENT_DEC (Lexing.lexeme lexbuf)) } | entity_hex { LEAF (ENT_HEX (Lexing.lexeme lexbuf)) } | entity_named { LEAF (ENT_NAMED (Lexing.lexeme lexbuf)) } | html_opening_tag { parse_html_opening_tag (Lexing.lexeme lexbuf) } | html_closing_tag { parse_html_closing_tag (Lexing.lexeme lexbuf) } | html_closed_tag { parse_html_closed_tag (Lexing.lexeme lexbuf) } | anything { LEAF (LIT (Lexing.lexeme lexbuf)) } | eof { EOF } (*   urls and [urls]    HTML and entities - of course there should be parsing and validation here    articlename - needs to parse variables inside, needs to check what        is allowed and what is not    some unicode magic ?    lexeme_length    complete literal match accelerator    interwiki magic *)

tokens.ml
(* tokens.ml — token types produced by the wiki lexer, plus helpers that
   map raw HTML-tag lexemes (e.g. "<b>", "</em>", "<br/>") to tokens.
   Fixes vs. the wiki-pasted draft: HTML-escaped operators restored
   ("&&", "<="), unused [l] bindings removed, the wildcard exception
   handler in [find_eotn] replaced by an explicit bounds check, and the
   deprecated [String.lowercase] replaced by [String.lowercase_ascii]
   (identical behavior on the ASCII tag names matched here). *)

(* Leaf tokens: literal text, entities, ISBN/RFC markers and the
   {{...}} variables the multipass parser emulation needs. *)
type t_leaf =
  | T3 | T4
  | LIT of string
  | ENT_DEC of string
  | RFC | ISBN
  | ENT_HEX of string
  | ENT_NAMED of string
  | VAR_CURRENTMONTH | VAR_CURRENTDAY | VAR_CURRENTYEAR
  | VAR_CURRENTDAYNAME | VAR_CURRENTTIME | VAR_NUMBEROFARTICLES

(* Structural tokens.  O_* / C_* are opening/closing HTML tags,
   Z_* are self-closed tags. *)
type t =
  | NL | SP of string | LIST of string | EQ of int | Q of int | HR
  | PRE of string | NOWIKI of string | MATH of string
  | LINK of string
  | LINK_INTERWIKI of string * string
  | LINK_DEFAULT of string
  | LINK_CLOSE
  | COLON | SEMI
  | LEAF of t_leaf
  | O_P | O_H1 | O_H2 | O_H3 | O_H4 | O_H5 | O_H6
  | C_P | C_H1 | C_H2 | C_H3 | C_H4 | C_H5 | C_H6
  | O_UL | O_OL | O_LI | O_TABLE | O_TR | O_TH | O_TD
  | C_UL | C_OL | C_LI | C_TABLE | C_TR | C_TH | C_TD
  | O_B | O_I | O_EM | O_STRONG
  | C_B | C_I | C_EM | C_STRONG
  | O_U | O_BIG | O_SMALL | O_SUB | O_SUP
  | C_U | C_BIG | C_SMALL | C_SUB | C_SUP
  | O_CITE | O_CODE | O_S | O_STRIKE | O_TT | O_VAR
  | C_CITE | C_CODE | C_S | C_STRIKE | C_TT | C_VAR
  | O_DIV | O_CENTER | O_BLOCKQUOTE | O_CAPTION
  | C_DIV | C_CENTER | C_BLOCKQUOTE | C_CAPTION
  | O_RUBY | O_RT | O_RB | O_RP | O_DT | O_DD
  | C_RUBY | C_RT | C_RB | C_RP | C_DT | C_DD
  | Z_BR | Z_HR | Z_TR | Z_TH | Z_TD
  | EOF

(* ASCII code boundaries used by [find_eotn]. *)
let char_code_0 = 48
let char_code_9 = 57
let char_code_a = 97
let char_code_z = 122
let char_code_A = 65
let char_code_Z = 90

(* [find_eotn str n0] returns the index just past the run of ASCII
   alphanumeric characters starting at [n0] — the "end of tag name".
   Returns [n0] itself when no alphanumeric character is there, and
   stops at the end of the string instead of catching every exception
   with a wildcard handler as the draft did. *)
let find_eotn str n0 =
  let len = String.length str in
  let rec find_eotn_aux n =
    if n >= len then n
    else
      let c = Char.code (String.get str n) in
      if (c >= char_code_0 && c <= char_code_9)
         || (c >= char_code_a && c <= char_code_z)
         || (c >= char_code_A && c <= char_code_Z)
      then find_eotn_aux (n + 1)
      else n
  in
  find_eotn_aux n0

(* [parse_html_opening_tag str] maps a raw "<name ...>" lexeme to its
   opening-tag token; unknown tag names fall back to a literal leaf so
   the text survives unchanged. *)
let parse_html_opening_tag str =
  let eotn = find_eotn str 1 in
  let tn = String.lowercase_ascii (String.sub str 1 (eotn - 1)) in
  match tn with
  | "p" -> O_P
  | "h1" -> O_H1
  | "h2" -> O_H2
  | "h3" -> O_H3
  | "h4" -> O_H4
  | "h5" -> O_H5
  | "h6" -> O_H6
  | "ul" -> O_UL
  | "ol" -> O_OL
  | "li" -> O_LI
  | "table" -> O_TABLE
  | "tr" -> O_TR
  | "th" -> O_TH
  | "td" -> O_TD
  | "b" -> O_B
  | "i" -> O_I
  | "em" -> O_EM
  | "strong" -> O_STRONG
  | "u" -> O_U
  | "big" -> O_BIG
  | "small" -> O_SMALL
  | "sub" -> O_SUB
  | "sup" -> O_SUP
  | "cite" -> O_CITE
  | "code" -> O_CODE
  | "s" -> O_S
  | "strike" -> O_STRIKE
  | "tt" -> O_TT
  | "var" -> O_VAR
  | "div" -> O_DIV
  | "center" -> O_CENTER
  | "blockquote" -> O_BLOCKQUOTE
  | "caption" -> O_CAPTION
  | "ruby" -> O_RUBY
  | "rt" -> O_RT
  | "rb" -> O_RB
  | "rp" -> O_RP
  | "dt" -> O_DT
  | "dd" -> O_DD
  | _ -> LEAF (LIT str)

(* [parse_html_closing_tag str] maps a raw "</name ...>" lexeme to its
   closing-tag token (tag name starts at index 2). *)
let parse_html_closing_tag str =
  let eotn = find_eotn str 2 in
  let tn = String.lowercase_ascii (String.sub str 2 (eotn - 2)) in
  match tn with
  | "p" -> C_P
  | "h1" -> C_H1
  | "h2" -> C_H2
  | "h3" -> C_H3
  | "h4" -> C_H4
  | "h5" -> C_H5
  | "h6" -> C_H6
  | "ul" -> C_UL
  | "ol" -> C_OL
  | "li" -> C_LI
  | "table" -> C_TABLE
  | "tr" -> C_TR
  | "th" -> C_TH
  | "td" -> C_TD
  | "b" -> C_B
  | "i" -> C_I
  | "em" -> C_EM
  | "strong" -> C_STRONG
  | "u" -> C_U
  | "big" -> C_BIG
  | "small" -> C_SMALL
  | "sub" -> C_SUB
  | "sup" -> C_SUP
  | "cite" -> C_CITE
  | "code" -> C_CODE
  | "s" -> C_S
  | "strike" -> C_STRIKE
  | "tt" -> C_TT
  | "var" -> C_VAR
  | "div" -> C_DIV
  | "center" -> C_CENTER
  | "blockquote" -> C_BLOCKQUOTE
  | "caption" -> C_CAPTION
  | "ruby" -> C_RUBY
  | "rt" -> C_RT
  | "rb" -> C_RB
  | "rp" -> C_RP
  | "dt" -> C_DT
  | "dd" -> C_DD
  | _ -> LEAF (LIT str)

(* [parse_html_closed_tag str] maps a raw self-closed "<name/>" lexeme
   to its Z_* token. *)
let parse_html_closed_tag str =
  let eotn = find_eotn str 1 in
  let tn = String.lowercase_ascii (String.sub str 1 (eotn - 1)) in
  match tn with
  | "br" -> Z_BR (* validate that attrs is empty *)
  | "hr" -> Z_HR (* validate that attrs is empty *)
  | "tr" -> Z_TR (* validate attrs *)
  | "th" -> Z_TH (* validate attrs *)
  | "td" -> Z_TD (* validate attrs *)
  | _ -> LEAF (LIT str)

util.ml
(* [string_brange str s e] strips the first [s] and the last [e]
   characters from [str]; the lexer uses it to peel delimiters off a
   lexeme (e.g. "<pre>…</pre>" -> "…").  Raises Invalid_argument when
   [s + e] exceeds the length of [str], like String.sub. *)
let string_brange str s e =
  let len = String.length str in
  let keep = len - s - e in
  String.sub str s keep

main.ml
open Tokens open Printf

(* Renders a token back to (roughly) its wikitext/HTML surface form,
   for the debugging driver below.
   NOTE(review): several result strings were eaten by the wiki renderer
   and now read as "" (HR's "----", LINK's "[[", the variable names);
   "&lt;" stands for "<" and "&amp;" for "&" throughout.  Restore both
   before compiling. *)
let string_of_token = function LEAF (LIT c)	 -> "lit " ^ c | LEAF VAR_CURRENTMONTH     -> "August" | LEAF VAR_CURRENTDAY      -> "27" | LEAF VAR_CURRENTYEAR     -> "2024" | LEAF VAR_CURRENTDAYNAME  -> "" | LEAF VAR_CURRENTTIME     -> "" | LEAF VAR_NUMBEROFARTICLES -> "" | LEAF (ENT_DEC s)	 -> "&amp;dec; " ^ s | LEAF (ENT_HEX s)	 -> "&amp;hex;" ^ s  | LEAF (ENT_NAMED s)	 -> "&amp;named;" ^ s  | NL       		 -> "\\n" | SP _    		 -> "sp" | EOF     		 -> "eof" | LIST s  		 -> "list " ^ s  | EQ i     		 -> "eq " ^ (string_of_int i)  | Q i      		 -> "q " ^ (string_of_int i)  | HR       		 -> "" | PRE _   		 -> "&lt;pre>?&lt;/pre>" | NOWIKI _		 -> "&lt;nowiki>?&lt;/nowiki>" | MATH _  		 -> "&lt;math>?&lt;/math>" | LINK_INTERWIKI (w,s) -> "" ^ w ^ ":" ^ s ^ "" | LINK s		 -> "" | LINK_DEFAULT s	 -> "[[" ^ s ^ ""  | LINK_CLOSE           -> "]]" | COLON		 -> ":" | SEMI		 -> ";" | LEAF T3		 -> " " | LEAF T4		 -> " ~ " | Z_BR		 -> "&lt;br/>" | Z_HR		 -> "&lt;hr/>" | Z_TR		 -> "&lt;tr/>" | Z_TD		 -> "&lt;td/>" | Z_TH		 -> "&lt;th/>" | O_B     		 -> "&lt;b>" | O_I     		 -> "&lt;i>" | O_P     		 -> "&lt;p>" | O_U     		 -> "&lt;u>" | O_S     		 -> "&lt;s>" | O_H1		 -> "&lt;h1>" | O_H2		 -> "&lt;h2>" | O_H3		 -> "&lt;h3>" | O_H4		 -> "&lt;h4>" | O_H5		 -> "&lt;h5>" | O_H6		 -> "&lt;h6>" | C_B     		 -> "&lt;/b>" | C_I     		 -> "&lt;/i>" | C_P     		 -> "&lt;/p>" | C_U     		 -> "&lt;/u>" | C_S     		 -> "&lt;/s>" | C_H1		 -> "&lt;/h1>" | C_H2		 -> "&lt;/h2>" | C_H3		 -> "&lt;/h3>" | C_H4		 -> "&lt;/h4>" | C_H5		 -> "&lt;/h5>" | C_H6		 -> "&lt;/h6>" | LEAF (RFC)		 -> "rfc" | LEAF (ISBN)		 -> "isbn" | O_UL		 -> "&lt;ul>" | O_OL		 -> "&lt;ol>" | O_LI		 -> "&lt;li>" | O_TR		 -> "&lt;tr>" | O_TH		 -> "&lt;th>" | O_TD		 -> "&lt;td>" | O_EM		 -> "&lt;em>" | O_TT		 -> "&lt;tt>" | O_RT		 -> "&lt;rt>" | O_RB		 -> "&lt;rb>" | O_RP		 -> "&lt;rp>" | O_DD		 -> "&lt;dd>" | O_DT		 -> "&lt;dl>" | C_UL		 -> "&lt;/ul>" | C_OL		 -> "&lt;/ol>" | C_LI		 -> "&lt;/li>" | C_TR		 -> "&lt;/tr>" | C_TH		 
-> "&lt;/th>" | C_TD		 -> "&lt;/td>" | C_EM		 -> "&lt;/em>" | C_TT		 -> "&lt;/tt>" | C_RT		 -> "&lt;/rt>" | C_RB		 -> "&lt;/rb>" | C_RP		 -> "&lt;/rp>" | C_DD		 -> "&lt;/dd>" | C_DT		 -> "&lt;/dl>" | O_BIG		 -> "&lt;big>" | O_SUP		 -> "&lt;sup>" | O_SUB		 -> "&lt;sub>" | O_VAR		 -> "&lt;var>" | O_DIV		 -> "&lt;div>" | C_BIG		 -> "&lt;/big>" | C_SUP		 -> "&lt;/sup>" | C_SUB		 -> "&lt;/sub>" | C_VAR		 -> "&lt;/var>" | C_DIV		 -> "&lt;/div>" | O_CODE		 -> "&lt;code>" | O_CITE		 -> "&lt;cite>" | O_RUBY		 -> "&lt;ruby>" | C_CODE		 -> "&lt;/code>" | C_CITE		 -> "&lt;/cite>" | C_RUBY		 -> "&lt;/ruby>" | O_SMALL		 -> "&lt;small>" | O_STRIKE		 -> "&lt;strike>" | O_STRONG		 -> "&lt;strong>" | O_CENTER		 -> "&lt;center>" | O_CAPTION		 -> "&lt;caption>" | O_BLOCKQUOTE	 -> "&lt;blockquote>" | C_SMALL		 -> "&lt;/small>" | C_STRIKE		 -> "&lt;/strike>" | C_STRONG		 -> "&lt;/strong>" | C_CENTER		 -> "&lt;/center>" | C_CAPTION		 -> "&lt;/caption>" | C_BLOCKQUOTE	 -> "&lt;/blockquote>" | O_TABLE		 -> "&lt;table>" | C_TABLE		 -> "&lt;/table>"

(* Single shared lexing buffer reading wikitext from stdin. *)
let lexbuf = Lexing.from_channel stdin

(* NOTE(review): the "*)" closing this commented-out debug driver was
   lost in the wiki paste, so as written the comment below never ends
   and swallows the rest of the file — restore the closer after
   "let _ = print_lexemes" before compiling. *)
(* let rec print_lexemes =    let t = Lexer.token lexbuf in	print_string (string_of_token t ^ "\\n");	if t &lt;> EOF	    then print_lexemes

let _ = print_lexemes let get_token = Lexer.token lexbuf

(* Output helpers for the trivial parser: print interwiki headers, leaf
   tokens, and tokens reconverted to plain text; [line_type] classifies
   the current line (normal / preformatted / heading attempt / list).
   NOTE(review): "&lt;" stands for "<" and the variable arms print the
   wiki-rendered values ("August", "27", …) because the {{...}} names
   were eaten by the paste. *)
let output_header_interwiki (i,a) = printf "Header interwiki: %s:%s\\n" i a let output_leaf = function LIT l      	   -> printf "Lit: %s\\n" l    | T3 	  	   -> printf "  \\n" | T4 	 	   -> printf " ~ \\n" | ENT_DEC e  	   -> printf "Ent dec: %s\\n" e    | RFC         	   -> printf "RFC\\n" | ISBN       	   -> printf "ISBN\\n" | ENT_HEX e  	   -> printf "Ent hex: %s\\n" e    | ENT_NAMED e 	   -> printf "Ent nam: %s\\n" e    | VAR_CURRENTMONTH     -> printf "August" | VAR_CURRENTDAY      -> printf "27" | VAR_CURRENTYEAR     -> printf "2024" | VAR_CURRENTDAYNAME  -> printf "" | VAR_CURRENTTIME     -> printf "" | VAR_NUMBEROFARTICLES -> printf "" let output_convert = function SP s       -> printf "sp %s\\n" s    | COLON       -> printf ":\\n" | SEMI	 -> printf ";\\n" | EQ i	 -> printf "= * %d\\n" i    | _           -> failwith "wrong convert" type line_type =  LineNormal | LinePre of string | LineEQTry of int | LineList of string

(* Proof-of-concept recursive-descent driver: [parse_header] consumes
   the leading interwiki-link header, then [parse_line_start] classifies
   each line and [parse_line_cnt] streams its tokens to the output
   helpers; [parse_line_cnt_tryeqf] checks whether an "=...=" pair
   really closes a heading at end of line.
   NOTE(review): in the EQ branch of [parse_line_cnt] the recursive call
   appears twice (once inside the inner match and once after it), which
   looks like a duplicated continuation — verify against the intended
   control flow before trusting this driver. *)
let rec parse_header sp t = match sp,t with Some s,SP s' 	        -> parse_header (Some (s^s')) (get_token ) | None,SP s             -> parse_header (Some s) (get_token ) | _,NL 		        -> parse_header None (get_token ) | _,LINK_INTERWIKI (i,a) -> output_header_interwiki (i,a); parse_header None (get_token ) | None,_                -> parse_line_start t	| Some s,_               -> parse_line_cnt (LinePre s) t and parse_line_start t = match t with SP s  -> parse_line_cnt (LinePre s) (get_token ) | LIST s -> parse_line_cnt (LineList s) (get_token ) | EQ s  -> parse_line_cnt (LineEQTry s) (get_token ) | NL    -> parse_line_start (get_token ) | _     -> parse_line_cnt LineNormal t and parse_line_cnt ltyp t = match t with LEAF l -> output_leaf l; parse_line_cnt ltyp (get_token ) | LIST _ -> output_convert t; parse_line_cnt ltyp (get_token ) | COLON -> output_convert t; parse_line_cnt ltyp (get_token ) | SEMI  -> output_convert t; parse_line_cnt ltyp (get_token ) | SP _  -> ((match ltyp with LinePre _ -> output_convert t		   | _         -> output_leaf (LIT " ") ); parse_line_cnt ltyp (get_token )) | EQ i  -> ((match ltyp with LineEQTry j when i = j -> parse_line_cnt_tryeqf i (get_token ) | _			     -> output_convert t; parse_line_cnt ltyp (get_token ) ); parse_line_cnt ltyp (get_token )) and parse_line_cnt_tryeqf i t = match t with SP _ -> parse_line_cnt_tryeqf i (get_token ) | NL  -> (* a real header line !!! *) parse_line_start (get_token ) | _   -> output_convert (EQ i); parse_line_cnt (LineEQTry i) t let _ = parse_header None (get_token )