Markup spec/OCaml

How to lex Wikipedia syntax - a draft.

= Idea =


 * New Line
 * End of File
 * whitespace +
 * (\* | # )+ (only at the start of a line)
 * ={1,6} (at the start or the end of a line)
 * [ [ articlenamespecification]]
 * [ [ articlenamespecification|
 * ] ]
 * urlspecification
 * [ urlspecification
 * ]
 * <pre> anything_but_close_pre </pre>
 * <nowiki> anything_but_close_nowiki </nowiki>
 * <math> anything_but_close_math </math>
 * <!-- anything_but_close_html_comment -->
 * ISBN whitespace [0-9X-]+
 * RFC whitespace \d+
 * (one token per valid HTML tag)
 * & entityspecification ;
 * anyothercharacter
 * variables (+ magic to parse "articlename" for variables to emulate current multipass parser)
 * <math> anything_but_close_math </math>
 * <!-- anything_but_close_html_comment -->
 * ISBN whitespace [0-9X-]+
 * RFC whitespace \d+
 * (one token per valid HTML tag)
 * & entityspecification ;
 * anyothercharacter
 * variables (+ magic to parse "articlename" for variables to emulate current multipass parser)

Regular expressions used:
 * articlenamespecification = ???
 * entityspecification = ???
 * urlspecification = ???
 * whitespace = [ \t]   (a space or a tab)

= Code =

The mini-lexer consists of 2 files, lexer.mll and tokens.ml. It is incomplete:

lexer.mll
{
  (* Token constructors (NL, SP, LIT, ...) are defined in tokens.ml. *)
  open Tokens
}

(* Any single byte (codes 0 to 255); used by the rule below as the LIT
   fallback. *)
let anything = ['\000'-'\255']

(* Non-empty text between "<math>" and "</math>".
   Each alternative accepts either a character other than '<', or a proper
   prefix of "</math>" followed by a character that breaks the prefix.
   Limitation: the breaking character is consumed as well, so a '<' placed
   immediately before the closing tag (e.g. "<</math>") hides that tag. *)
let anything_but_close_math =
  ( [^'<']
  | '<' [^'/']
  | "</" [^'m']
  | "</m" [^'a']
  | "</ma" [^'t']
  | "</mat" [^'h']
  | "</math" [^'>']
  ) +

(* Non-empty text between "<pre>" and "</pre>".
   Each alternative accepts either a character other than '<', or a proper
   prefix of "</pre>" followed by a character that breaks the prefix.
   Limitation: the breaking character is consumed as well, so a '<' placed
   immediately before the closing tag (e.g. "<</pre>") hides that tag. *)
let anything_but_close_pre =
  ( [^'<']
  | '<' [^'/']
  | "</" [^'p']
  | "</p" [^'r']
  | "</pr" [^'e']
  | "</pre" [^'>']
  ) +

(* Non-empty text between "<nowiki>" and "</nowiki>".
   Each alternative accepts either a character other than '<', or a proper
   prefix of "</nowiki>" followed by a character that breaks the prefix.
   Limitation: the breaking character is consumed as well, so a '<' placed
   immediately before the closing tag (e.g. "<</nowiki>") hides that tag. *)
let anything_but_close_nowiki =
  ( [^'<']
  | '<' [^'/']
  | "</" [^'n']
  | "</n" [^'o']
  | "</no" [^'w']
  | "</now" [^'i']
  | "</nowi" [^'k']
  | "</nowik" [^'i']
  | "</nowiki" [^'>']
  ) +

(* Non-empty body of an HTML comment, built from chunks that cannot form
   "-->": two dashes followed by anything but '>', one dash followed by a
   non-dash, or a single non-dash character. *)
let anything_but_close_comment =
  ( "--" [^ '>']
  | '-' [^ '-']
  | [^ '-']
  ) +

(* Basic character classes shared by the definitions and the rule below. *)

(* Horizontal whitespace: space or tab. *)
let whitespace = [' ' '\t']

(* ASCII decimal digit. *)
let digit = ['0'-'9']

(* ASCII hexadecimal digit, either case. *)
let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']

(* ASCII letter or digit. *)
let alphanum = ['a'-'z' 'A'-'Z' '0'-'9']

(* ASCII letter. *)
let alpha = ['a'-'z' 'A'-'Z']

(* Character entity references. Only the shape is checked here; the entity
   name or code point is not validated. *)

(* Named entity, e.g. "&nbsp;". *)
let entity_named = "&" alphanum + ";"

(* Decimal numeric entity, e.g. "&#160;". *)
let entity_dec = "&#" digit + ";"

(* Hexadecimal numeric entity, e.g. "&#xA0;" (lowercase 'x' only). *)
let entity_hex = "&#x" hexdigit + ";"

(* Whitespace allowed inside an HTML tag: space, tab, CR or LF. *)
let html_space = [' ' '\t' '\r' '\n']

(* Possibly empty run of in-tag whitespace. *)
let html_space_opt = html_space *

(* Attribute value written without quotes. *)
let html_attr_unquoted = ['a'-'z' 'A'-'Z' '0'-'9' '_' ',' ':' '-'] +

(* Attribute value: single-quoted, double-quoted, or unquoted. Quoted values
   may be empty and may not contain their own quote character. *)
let html_attr_arg =
  ( '\'' [^'\''] * '\''
  | '"' [^'"'] * '"'
  | html_attr_unquoted
  )

(* Attribute name: ASCII letters only. *)
let html_attr_name = alpha +

(* One name=value pair; whitespace is allowed on either side of "=". *)
let html_attr = html_attr_name html_space * "=" html_space * html_attr_arg

(* Zero or more attributes, each preceded by at least one whitespace
   character. *)
let html_attrs = (html_space + html_attr) *

(* Tag name: one ASCII letter followed by any number of letters or digits
   (covers names such as "h1"). *)
let html_tag_name = alpha (alphanum *)

(* Opening tag with optional attributes, e.g. <td align="left">. *)
let html_opening_tag = "<" html_tag_name html_attrs html_space_opt ">"

(* Closing tag, e.g. </td>; no attributes allowed. *)
let html_closing_tag = "</" html_tag_name html_space_opt ">"

(* Self-closed tag, e.g. <br/> or <br />.
   NOTE(review): no attributes are accepted here, although
   parse_html_closed_tag carries "validate attrs" TODOs; confirm whether
   html_attrs should be allowed before html_space_opt. *)
let html_closed_tag = "<" html_tag_name html_space_opt "/>"

(* Characters accepted in the target of a [[link]]: ASCII letters and digits,
   a few punctuation characters, space, braces, and every byte >= 128 (so
   multi-byte encoded text passes through unchecked).
   NOTE(review): this class was reconstructed from a garbled listing; verify
   the punctuation set against the intended article-name rules. *)
let articlename =
  [ 'a'-'z' 'A'-'Z' '0'-'9'
    ':' '_' '+' ',' '.' ' ' '{' '}'
    '\128'-'\255' ] +

(* Main entry point: returns the next Tokens.t for the input in [lexbuf].
   Carriage returns and HTML comments produce no token; the rule re-enters
   itself to skip them. Any byte not covered by another case becomes LIT.

   NOTE(review): several string literals in this rule were lost when the
   listing was rendered (they appeared as "" or as rendered values such as a
   date). The link, horizontal-rule, variable and tilde patterns below are
   reconstructed from the token names and the token list in the spec; confirm
   them before relying on this lexer. *)
rule token = parse
    '\n'                { NL }
  | '\r'                { token lexbuf }
  | whitespace +        { SP }
  | ['#' '*'] +         { LIST }
  | "=" +               { EQ }
  | '\'' '\'' +         { Q }
  | "----" '-' *        { HR }
  | "<pre>" anything_but_close_pre "</pre>"
                        { PRE }
  | "<nowiki>" anything_but_close_nowiki "</nowiki>"
                        { NOWIKI }
  | "<math>" anything_but_close_math "</math>"
                        { MATH }
  | "<!--" anything_but_close_comment "-->"
                        { token lexbuf }
  | "[[" articlename "]]"
                        { LINK }
  | "[[" articlename "|"
                        { LINK_DEFAULT }
  | "]]"                { LINK_CLOSE }
  | "{{CURRENTMONTH}}"      { VAR_CURRENTMONTH }
  | "{{CURRENTDAY}}"        { VAR_CURRENTDAY }
  | "{{CURRENTYEAR}}"       { VAR_CURRENTYEAR }
  | "{{CURRENTDAYNAME}}"    { VAR_CURRENTDAYNAME }
  | "{{CURRENTTIME}}"       { VAR_CURRENTTIME }
  | "{{NUMBEROFARTICLES}}"  { VAR_NUMBEROFARTICLES }
  | ":"                 { COLON }
  | ";"                 { SEMI }
  | "~~~"               { T3 }
  | "~~~~"              { T4 }
  | entity_dec          { ENT_DEC }
  | "RFC" " " ? digit + { RFC }
  | "ISBN" " " ? ['0'-'9' 'X' '-'] +
                        { ISBN }
  | entity_hex          { ENT_HEX }
  | entity_named        { ENT_NAMED }
  | html_opening_tag    { parse_html_opening_tag (Lexing.lexeme lexbuf) }
  | html_closing_tag    { parse_html_closing_tag (Lexing.lexeme lexbuf) }
  | html_closed_tag     { parse_html_closed_tag (Lexing.lexeme lexbuf) }
  | anything            { LIT }
  | eof                 { EOF }

(* Still missing:
     - urls and [urls]
     - HTML and entities: of course there should be parsing and validation
       here
     - articlename: needs to parse variables inside, needs to check what is
       allowed and what is not
     - some unicode magic? *)

tokens.ml
(* Tokens produced by the lexer.
   Prefixes on the HTML constructors: O_ = opening tag, C_ = closing tag,
   Z_ = self-closed tag. The constructor order is kept exactly as before. *)
type t =
  (* line structure and wiki markup *)
  | NL | SP | LIST | EQ | Q | HR
  (* verbatim sections *)
  | PRE | NOWIKI | MATH
  (* links *)
  | LINK | LINK_DEFAULT | LINK_CLOSE
  (* variables *)
  | VAR_CURRENTMONTH | VAR_CURRENTDAY | VAR_CURRENTYEAR
  | VAR_CURRENTDAYNAME | VAR_CURRENTTIME | VAR_NUMBEROFARTICLES
  (* punctuation, tildes, entities, magic references *)
  | COLON | SEMI | T3 | T4
  | ENT_DEC | RFC | ISBN | ENT_HEX | ENT_NAMED
  (* paragraphs and headings *)
  | O_P | O_H1 | O_H2 | O_H3 | O_H4 | O_H5 | O_H6
  | C_P | C_H1 | C_H2 | C_H3 | C_H4 | C_H5 | C_H6
  (* lists and tables *)
  | O_UL | O_OL | O_LI | O_TABLE | O_TR | O_TH | O_TD
  | C_UL | C_OL | C_LI | C_TABLE | C_TR | C_TH | C_TD
  (* inline emphasis *)
  | O_B | O_I | O_EM | O_STRONG
  | C_B | C_I | C_EM | C_STRONG
  (* inline size and position *)
  | O_U | O_BIG | O_SMALL | O_SUB | O_SUP
  | C_U | C_BIG | C_SMALL | C_SUB | C_SUP
  (* inline semantics *)
  | O_CITE | O_CODE | O_S | O_STRIKE | O_TT | O_VAR
  | C_CITE | C_CODE | C_S | C_STRIKE | C_TT | C_VAR
  (* block containers *)
  | O_DIV | O_CENTER | O_BLOCKQUOTE | O_CAPTION
  | C_DIV | C_CENTER | C_BLOCKQUOTE | C_CAPTION
  (* ruby annotations and definition lists *)
  | O_RUBY | O_RT | O_RB | O_RP | O_DT | O_DD
  | C_RUBY | C_RT | C_RB | C_RP | C_DT | C_DD
  (* self-closed tags *)
  | Z_BR | Z_HR | Z_TR | Z_TH | Z_TD
  (* fallback literal and end of input *)
  | LIT | EOF

(* ASCII codes bounding the digit, lowercase and uppercase letter ranges. *)
let char_code_0 = Char.code '0'
let char_code_9 = Char.code '9'
let char_code_a = Char.code 'a'
let char_code_z = Char.code 'z'
let char_code_A = Char.code 'A'
let char_code_Z = Char.code 'Z'

(* [find_eotn str n0] ("end of tag name") returns the index of the first
   character at or after [n0] that is not an ASCII letter or digit, or
   [String.length str] when the alphanumeric run reaches the end of [str].
   If [n0] lies outside [str], it is returned unchanged. *)
let find_eotn str n0 =
  let len = String.length str in
  let is_alphanum = function
    | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' -> true
    | _ -> false
  in
  (* The explicit bounds test replaces the former catch-all exception
     handler around String.get. *)
  let rec scan n =
    if n >= 0 && n < len && is_alphanum str.[n] then scan (n + 1) else n
  in
  scan n0

(* [tag_name_at str start] is the lowercased tag name of [str]: the run of
   ASCII letters and digits beginning at offset [start].
   Raises Invalid_argument (from String.sub) if [start] is past the end of
   [str]; the lexer only passes lexemes that begin with "<" or "</". *)
let tag_name_at str start =
  let stop = find_eotn str start in
  String.lowercase_ascii (String.sub str start (stop - start))

(* [parse_html_opening_tag str] maps the lexeme of an opening tag, such as
   "<TD align=left>", to its O_ token. Tags without an O_ token yield LIT.
   Previously the tag name was computed and then discarded, so every opening
   tag came back as LIT. Attributes are neither parsed nor validated. *)
let parse_html_opening_tag str =
  match tag_name_at str 1 with
  | "p"          -> O_P
  | "h1"         -> O_H1
  | "h2"         -> O_H2
  | "h3"         -> O_H3
  | "h4"         -> O_H4
  | "h5"         -> O_H5
  | "h6"         -> O_H6
  | "ul"         -> O_UL
  | "ol"         -> O_OL
  | "li"         -> O_LI
  | "table"      -> O_TABLE
  | "tr"         -> O_TR
  | "th"         -> O_TH
  | "td"         -> O_TD
  | "b"          -> O_B
  | "i"          -> O_I
  | "em"         -> O_EM
  | "strong"     -> O_STRONG
  | "u"          -> O_U
  | "big"        -> O_BIG
  | "small"      -> O_SMALL
  | "sub"        -> O_SUB
  | "sup"        -> O_SUP
  | "cite"       -> O_CITE
  | "code"       -> O_CODE
  | "s"          -> O_S
  | "strike"     -> O_STRIKE
  | "tt"         -> O_TT
  | "var"        -> O_VAR
  | "div"        -> O_DIV
  | "center"     -> O_CENTER
  | "blockquote" -> O_BLOCKQUOTE
  | "caption"    -> O_CAPTION
  | "ruby"       -> O_RUBY
  | "rt"         -> O_RT
  | "rb"         -> O_RB
  | "rp"         -> O_RP
  | "dt"         -> O_DT
  | "dd"         -> O_DD
  | _            -> LIT

(* [parse_html_closing_tag str] maps the lexeme of a closing tag, such as
   "</td>", to its C_ token. Unknown tags yield LIT. The name starts at
   offset 2, after "</". *)
let parse_html_closing_tag str =
  match tag_name_at str 2 with
  | "p"          -> C_P
  | "h1"         -> C_H1
  | "h2"         -> C_H2
  | "h3"         -> C_H3
  | "h4"         -> C_H4
  | "h5"         -> C_H5
  | "h6"         -> C_H6
  | "ul"         -> C_UL
  | "ol"         -> C_OL
  | "li"         -> C_LI
  | "table"      -> C_TABLE
  | "tr"         -> C_TR
  | "th"         -> C_TH
  | "td"         -> C_TD
  | "b"          -> C_B
  | "i"          -> C_I
  | "em"         -> C_EM
  | "strong"     -> C_STRONG
  | "u"          -> C_U
  | "big"        -> C_BIG
  | "small"      -> C_SMALL
  | "sub"        -> C_SUB
  | "sup"        -> C_SUP
  | "cite"       -> C_CITE
  | "code"       -> C_CODE
  | "s"          -> C_S
  | "strike"     -> C_STRIKE
  | "tt"         -> C_TT
  | "var"        -> C_VAR
  | "div"        -> C_DIV
  | "center"     -> C_CENTER
  | "blockquote" -> C_BLOCKQUOTE
  | "caption"    -> C_CAPTION
  | "ruby"       -> C_RUBY
  | "rt"         -> C_RT
  | "rb"         -> C_RB
  | "rp"         -> C_RP
  | "dt"         -> C_DT
  | "dd"         -> C_DD
  | _            -> LIT

(* [parse_html_closed_tag str] maps the lexeme of a self-closed tag, such as
   "<br/>", to its Z_ token. Unknown tags yield LIT. *)
let parse_html_closed_tag str =
  match tag_name_at str 1 with
  | "br" -> Z_BR (* validate that attrs is empty *)
  | "hr" -> Z_HR (* validate that attrs is empty *)
  | "tr" -> Z_TR (* validate attrs *)
  | "th" -> Z_TH (* validate attrs *)
  | "td" -> Z_TD (* validate attrs *)
  | _    -> LIT