Proposed Database Schema Changes/October 2004

This is in progress... working code is in CVS branch SCHEMA_WORK.

The current cur/old table split is kind of ugly; it is hard to work with and slows down aggregate operations by putting large amounts of bulk text into a table that is frequently being looked over for the smaller metadata fields.

Here's a color-coded diagram of the proposed split and how the old fields (on the left) map to the new ones (on the right):
 * [[Image:Database-restructure.png]]

(page_latest points to the rev_id/old_id of the page's current revision.)

Advantages of new structure:
 * Renames don't require changing the revision or text table, only 'page' and link updates.
 * Aggregate operations don't need to slog through text-laden tables.
 * Don't need to merge cur and old manually in things like contribs
 * rev_id will be a unique id for both current and old revisions (and will retain existing old_id values after conversion)
 * and more!

Potential changes:
 * If text.old_id and rev_id are decoupled we could avoid storing a second copy of the text on reverts. Not sure if this is worth the trouble.

-- D A N G E R -- Don't execute this SQL script in a replicated environment!! -- Replication does not replicate the values of @variables!!!

d'oh! will need to fix this :)

-- The Great Restructuring of October 2004
-- Creates 'page', 'revision' tables and transforms the classic
-- cur+old into a separate page+revision+text structure.
--
-- The pre-conversion 'old' table is renamed to 'text' and used
-- without internal restructuring to avoid rebuilding the entire
-- table. (This can be done separately if desired.)
--
-- The pre-conversion 'cur' table is now redundant and can be
-- discarded when done.
--
-- Before starting, change MySQL settings for bulk loading.
-- Use 4-32M buffers: big enough for one batch of old records.
-- set-variable   = innodb_flush_log_at_trx_commit=0
-- set-variable   = read_buffer_size=4M
-- set-variable   = read_rnd_buffer_size=4M
-- set-variable   = innodb_log_buffer_size=4M
-- set-variable   = bulk_insert_buffer_size=4M

-- Per-page metadata only; revision text lives in the 'text' table
-- (the renamed 'old'), keeping this table small for aggregate scans.
CREATE TABLE page (
  page_id int(8) unsigned NOT NULL auto_increment,
  page_namespace tinyint NOT NULL,
  page_title varchar(255) binary NOT NULL,
  -- NOTE(fix): the dropped '' after each "default" below was a syntax
  -- error ("default ,"); restored to an empty-string default.
  page_restrictions tinyblob NOT NULL default '',
  page_counter bigint(20) unsigned NOT NULL default '0',
  page_is_redirect tinyint(1) unsigned NOT NULL default '0',
  page_is_new tinyint(1) unsigned NOT NULL default '0',
  page_random real unsigned NOT NULL,
  page_touched char(14) binary NOT NULL default '',
  -- page_latest points at the rev_id of the page's current revision.
  page_latest int(8) unsigned NOT NULL,

  PRIMARY KEY page_id (page_id),
  UNIQUE INDEX name_title (page_namespace,page_title),
  INDEX (page_random)
);

-- Original single-pass 'revision' definition, kept commented out for
-- reference only; the batched variant later in this script supersedes it.
-- NOTE(fix): the /* block comment was never closed, which would have
-- swallowed the rest of the script — terminated with */ below.
/*
CREATE TABLE revision (
  rev_id int(8) unsigned NOT NULL auto_increment,
  rev_page int(8) unsigned NOT NULL,
  rev_link varchar(255) binary NOT NULL default '',
  rev_comment tinyblob NOT NULL default '',
  rev_user int(5) unsigned NOT NULL default '0',
  rev_user_text varchar(255) binary NOT NULL default '',
  rev_timestamp char(14) binary NOT NULL default '',
  rev_minor_edit tinyint(1) unsigned NOT NULL default '0',
  inverse_timestamp char(14) binary NOT NULL default '',

  PRIMARY KEY rev_page_id (rev_page, rev_id),
  UNIQUE INDEX rev_id (rev_id),
  INDEX rev_timestamp (rev_timestamp),
  INDEX page_timestamp (rev_page,inverse_timestamp),
  INDEX user_timestamp (rev_user,inverse_timestamp),
  INDEX usertext_timestamp (rev_user_text,inverse_timestamp)
);
*/

-- If creating a new 'text' table it would look like this:
--
-- CREATE TABLE text (
--   old_id int(8) unsigned NOT NULL auto_increment,
--   old_text mediumtext NOT NULL default '',
--   old_flags tinyblob NOT NULL default '',
--   old_diff int(8) unsigned NOT NULL,
--
--   PRIMARY KEY old_id (old_id)
-- );

-- Lock everything the conversion touches so no edits land mid-migration.
-- NOTE(fix): the comment was fused onto the same line as the statement,
-- which commented out the LOCK entirely — split back onto its own line.
LOCK TABLES page WRITE, revision WRITE, old WRITE, cur WRITE;

-- Save the last pre-conversion old_id; rows inserted after this point are
-- the copied current revisions (used by the page-table fill below).
-- NOTE(fix): comment was fused with the SELECT, commenting it out.
-- WARNING: @variables do not replicate — do not run this on a replica set.
SELECT (@maxold:=MAX(old_id)) FROM old;

-- First, copy all current entries into the old table, so every revision
-- (current included) lives in one place before the split.
-- NOTE(fix): comment was fused with the INSERT, commenting it out.
INSERT INTO old
  (old_namespace, old_title, old_text, old_comment, old_user,
   old_user_text, old_timestamp, old_minor_edit, old_flags,
   inverse_timestamp)
SELECT cur_namespace, cur_title, cur_text, cur_comment, cur_user,
       cur_user_text, cur_timestamp, cur_minor_edit, '',
       inverse_timestamp
FROM cur;

-- Now, copy all old data except the text into revisions.
-- rev_id keeps the existing old_id values; rev_page is resolved by
-- matching namespace+title against cur.
-- NOTE(fix): comment was fused with the INSERT, commenting it out;
-- implicit comma join rewritten as an explicit INNER JOIN.
INSERT INTO revision
  (rev_id, rev_page, rev_comment, rev_user, rev_user_text,
   rev_timestamp, inverse_timestamp, rev_minor_edit)
SELECT old_id, cur_id, old_comment, old_user, old_user_text,
       old_timestamp, old.inverse_timestamp, old_minor_edit
FROM old
INNER JOIN cur
  ON old_namespace=cur_namespace AND old_title=cur_title;

-- Now, copy all old data except the text into revisions.
-- This version should be significantly faster:
-- * Reads in old table order (the biggest, slowest job).
-- * Writes in revision primary key order, so quick changes to one
--   article should have a higher hit rate in the new index cache.
-- * Repeats moderately small selects so the select/insert won't be
--   one massive transaction. Iterate until done.
--   Chunk size? No idea of best. Maybe 1,000 records?
-- * Only ORDER BY cur_id, old_id if doing batches - else will have a
--   massive temporary table!
-- * Few indexes for faster initial load; adds more later.
CREATE TABLE revision (
  rev_id int(8) unsigned NOT NULL auto_increment,
  rev_page int(8) unsigned NOT NULL,
  -- NOTE(fix): the dropped '' after each "default" below was a syntax
  -- error ("default ,"); restored to an empty-string default.
  rev_link varchar(255) binary NOT NULL default '',
  rev_comment tinyblob NOT NULL default '',
  rev_user int(5) unsigned NOT NULL default '0',
  rev_user_text varchar(255) binary NOT NULL default '',
  rev_timestamp char(14) binary NOT NULL default '',
  rev_minor_edit tinyint(1) unsigned NOT NULL default '0',
  inverse_timestamp char(14) binary NOT NULL default '',

  PRIMARY KEY rev_page_id (rev_page, rev_id),
  UNIQUE INDEX rev_id (rev_id)
);

-- Batch copy: replace xxxx/yyyy with the batch's old_id bounds
-- (half-open range) and repeat until the whole old table is covered.
INSERT INTO revision
  (rev_id, rev_page, rev_comment, rev_user, rev_user_text,
   rev_timestamp, inverse_timestamp, rev_minor_edit)
SELECT old_id, cur_id, old_comment, old_user, old_user_text,
       old_timestamp, old.inverse_timestamp, old_minor_edit
FROM old USE INDEX (old_id)
INNER JOIN cur
  ON old_namespace=cur_namespace AND old_title=cur_title
WHERE old_id >= xxxx AND old_id < yyyy
ORDER BY cur_id, old_id;
COMMIT;

-- Repeat until done. Then add the remaining keys in one pass.
ALTER TABLE revision
  ADD INDEX rev_timestamp (rev_timestamp),
  ADD INDEX page_timestamp (rev_page,inverse_timestamp),
  ADD INDEX user_timestamp (rev_user,inverse_timestamp),
  ADD INDEX usertext_timestamp (rev_user_text,inverse_timestamp);

-- And, copy the cur data into page.
-- page_latest is the rev_id of the copied current revision; the
-- rev_id > @maxold filter restricts the match to rows inserted by the
-- cur->old copy above (all pre-existing old rows have smaller ids).
-- NOTE(fix): comment was fused with the INSERT, commenting it out;
-- implicit comma join rewritten with the join condition in ON.
INSERT INTO page
  (page_id, page_namespace, page_title, page_restrictions, page_counter,
   page_is_redirect, page_is_new, page_random, page_touched, page_latest)
SELECT cur_id, cur_namespace, cur_title, cur_restrictions, cur_counter,
       cur_is_redirect, cur_is_new, cur_random, cur_touched, rev_id
FROM cur
INNER JOIN revision
  ON cur_id=rev_page
WHERE rev_timestamp=cur_timestamp
  AND rev_id > @maxold;

-- Conversion complete: release the write locks taken at the start.
UNLOCK TABLES;

-- Keep the old table around as the text store.
-- Its extra fields will be ignored, but trimming them is slow,
-- so we won't bother doing it for now.
-- NOTE(fix): comment was fused with the ALTER, commenting it out.
ALTER TABLE old RENAME TO text;

-- If and when support for differential storage is written, something
-- like this might be necessary:
-- ALTER TABLE text ADD old_diff int(8) unsigned NOT NULL auto_increment;