Markup spec/OCaml

Draft lexer for the MediaWiki markup implemented in OCaml.

Other wikis have benefited from using a real lexer instead of a series of regular expressions. For example, the lexer from the Wookee engine for UseModWiki (which is unfortunately written in Perl, not PHP) could even be quite usable if HTML attribute parsing were added. (The parser, however, is questionable.)

Later some proof-of-concept parser could be made. It should probably generate some AST, not (X)HTML directly, so it can be used in many useful bots too. Or have 2 modes - AST for bots and HTML for maximum performance.

Idea

 * New Line
 * End of File
 * whitespace +
 * (\* | # )+ (only at linestart)
 * ={1,6} (at linestart or lineend)
 * [[articlenamespecification]]
 * [[articlenamespecification|
 * ]]
 * urlspecification
 * [ urlspecification
 * ]
 * <pre> anything_but_close_pre </pre>
 * <nowiki> anything_but_close_nowiki </nowiki>
 * <math> anything_but_close_math </math>
 * <!-- anything_but_close_html_comment -->
 * ISBN whitespace [0-9X-]+
 * RFC whitespace \d+
 * (one token per valid HTML tag)
 * & entityspecification ;
 * anyothercharacter
 * variables (+ magic to parse "articlename" for variables to emulate current multipass parser)
 * <math> anything_but_close_math </math>
 * <!-- anything_but_close_html_comment -->
 * ISBN whitespace [0-9X-]+
 * RFC whitespace \d+
 * (one token per valid HTML tag)
 * & entityspecification ;
 * anyothercharacter
 * variables (+ magic to parse "articlename" for variables to emulate current multipass parser)

Regular expressions used:
 * articlenamespecification = ???
 * entityspecification = ???
 * urlspecification = ???
 * whitespace = [ \t] (space or tab; the lexer below defines it the same way)

= Code =

The mini-lexer has 4 files: lexer.mll, util.ml, tokens.ml, and main.ml (which is a kind of trivial parser). It's incomplete:

lexer.mll
{   open Tokens open Util } let anything = ['a'-'zA'-'Z0'-'9''\\128'-'\\255']+ | ['\\000'-'\\255']

let anything_but_close_math = ( [^'&lt;'] | '&lt;' [^'/'] | "&lt;/" [^'m'] | "&lt;/m" [^'a'] | "&lt;/ma" [^'t'] | "&lt;/mat" [^'h'] | "&lt;/math" [^'>'] ) +

let anything_but_close_pre = ( [^'&lt;'] | '&lt;' [^'/'] | "&lt;/" [^'p'] | "&lt;/p" [^'r'] | "&lt;/pr" [^'e'] | "&lt;/pre" [^'>'] ) +

let anything_but_close_nowiki = ( [^'&lt;'] | '&lt;' [^'/'] | "&lt;/" [^'n'] | "&lt;/n" [^'o'] | "&lt;/no" [^'w'] | "&lt;/now" [^'i'] | "&lt;/nowi" [^'k'] | "&lt;/nowik" [^'i'] | "&lt;/nowiki" [^'>'] ) +

let anything_but_close_comment = ( [^'-'] | '-' [^'-'] | "--" [^'>'] ) +

let whitespace = [' ''\\t'] let digit = ['0'-'9'] let hexdigit = ['0'-'9a'-'fA'-'F'] let alphanum = ['a'-'zA'-'Z0'-'9'] let alpha = ['a'-'z''A'-'Z']

let entity_named = "&amp;" alphanum + ";" let entity_dec = "&amp;#" digit + ";" let entity_hex = "&amp;#x" hexdigit + ";"

let html_space = [' \\t\\r''\\n'] let html_space_opt = html_space *

let html_attr_unquoted = ['a'-'z' 'A'-'Z' '0'-'9' '_' ',' ':' '-'] + let html_attr_arg = ('\\ [^'\\] * '\\'' | '"' [^'"'] * '"' | html_attr_unquoted) let html_attr_name = alpha + let html_attr  = html_attr_name html_space * "=" html_space * html_attr_arg let html_attrs = (html_space + html_attr) *

let html_tag_name = alpha alphanum *

let html_opening_tag = "&lt;" html_tag_name html_attrs html_space_opt ">" let html_closing_tag = "&lt;/" html_tag_name html_space_opt ">" let html_closed_tag = "&lt;"  html_tag_name html_space_opt "/>"

let articlename = [' -a'-'zA'-'Z0'-'9:_+,. {}''\\128'-'\\255']+

rule token = parse '\\n' { NL } | '\\r' { token lexbuf } | whitespace + { SP (Lexing.lexeme lexbuf) } | ['#''*'] +     { LIST (Lexing.lexeme lexbuf) } | "=" +     { EQ (String.length (Lexing.lexeme lexbuf)) } | '\\ '\\ +     { Q (String.length (Lexing.lexeme lexbuf)) } | "" '-' *     { HR } | "&lt;pre>" anything_but_close_pre "&lt;/pre>" { PRE (string_brange (Lexing.lexeme lexbuf) 5 6) } | "&lt;nowiki>" anything_but_close_nowiki "&lt;/nowiki>" { NOWIKI (string_brange (Lexing.lexeme lexbuf) 8 9) } | "&lt;math>" anything_but_close_math "&lt;/math>" { MATH (string_brange (Lexing.lexeme lexbuf) 6 7) } | "&lt;!--" anything_but_close_comment "-->" { token lexbuf } | ""     { LINK (string_brange (Lexing.lexeme lexbuf) 2 1) }  | "[[de:" articlename ""      { LINK_INTERWIKI ("de", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("en", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("eo", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("fr", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_INTERWIKI ("pl", string_brange (Lexing.lexeme lexbuf) 5 2) }  | "" articlename ""      { LINK_DEFAULT (string_brange (Lexing.lexeme lexbuf) 2 2) }  | "]]" { LINK_CLOSE } | "August" { LEAF VAR_CURRENTMONTH } | "27"     { LEAF VAR_CURRENTDAY } | "2024"     { LEAF VAR_CURRENTYEAR } | "Tuesday" { LEAF VAR_CURRENTDAYNAME } | ""     { LEAF VAR_CURRENTTIME } | ""     { LEAF VAR_NUMBEROFARTICLES } | ":"     { COLON } | ";"     { SEMI } | " "      { LEAF T3 } | " ~ "     { LEAF T4 } | "RFC" " " ? digit + { LEAF (RFC) } | "ISBN" " " ? 
['0'-'9X-'] + { LEAF (ISBN) } | entity_dec { LEAF (ENT_DEC (Lexing.lexeme lexbuf)) } | entity_hex { LEAF (ENT_HEX (Lexing.lexeme lexbuf)) } | entity_named { LEAF (ENT_NAMED (Lexing.lexeme lexbuf)) } | html_opening_tag { parse_html_opening_tag (Lexing.lexeme lexbuf) } | html_closing_tag { parse_html_closing_tag (Lexing.lexeme lexbuf) } | html_closed_tag { parse_html_closed_tag (Lexing.lexeme lexbuf) } | anything { LEAF (LIT (Lexing.lexeme lexbuf)) } | eof { EOF } (*   urls and [urls]    HTML and entities - of course there should be parsing and validation here    articlename - needs to parse variables inside, needs to check what        is allowed and what is not    some unicode magic ?    lexeme_length    complete literal match accelerator    interwiki magic *)