Template:SchemaDiagram/sql/wikibase-repo-1.35.txt

From mediawiki.org
-- MySQL version of the database schema for the Wikibase extension.
-- Licence: GNU GPL v2+


-- TODO: figure out which lengths to use for some of the varchar fields.


-- Unique ID generator.
CREATE TABLE IF NOT EXISTS /*_*/wb_id_counters (
  id_value                   INT unsigned        NOT NULL,
  id_type                    VARBINARY(32)       NOT NULL
) /*$wgDBTableOptions*/;

CREATE UNIQUE INDEX /*i*/wb_id_counters_type ON /*_*/wb_id_counters (id_type);



-- Derived storage.
-- Links site+title pairs to item ids.
CREATE TABLE IF NOT EXISTS /*_*/wb_items_per_site (
  ips_row_id                 BIGINT unsigned     NOT NULL PRIMARY KEY AUTO_INCREMENT, -- row ID
  ips_item_id                INT unsigned        NOT NULL, -- Id of the item
  ips_site_id                VARBINARY(32)       NOT NULL, -- Site identifier (global)
  ips_site_page              VARCHAR(310)        NOT NULL -- Prefixed title of the page
) /*$wgDBTableOptions*/;

CREATE UNIQUE INDEX /*i*/wb_ips_item_site_page ON /*_*/wb_items_per_site (ips_site_id, ips_site_page);
CREATE INDEX /*i*/wb_ips_item_id ON /*_*/wb_items_per_site (ips_item_id);



-- Lookup table for entity terms (ie labels, aliases, descriptions).
-- NOTE: keep the Wikimedia specific terms.wmf.sql in sync with this!
CREATE TABLE IF NOT EXISTS /*_*/wb_terms (
  term_row_id                BIGINT unsigned     NOT NULL PRIMARY KEY AUTO_INCREMENT, -- row ID
  term_entity_id             INT unsigned        NOT NULL, -- Id of the entity
  term_full_entity_id        VARBINARY(32)       DEFAULT NULL, -- Full id of the entity (not only numeric part)
  term_entity_type           VARBINARY(32)       NOT NULL, -- Type of the entity
  term_language              VARBINARY(32)       NOT NULL, -- Language code
  term_type                  VARBINARY(32)       NOT NULL, -- Term type
  term_text                  VARCHAR(255) binary NOT NULL, -- The term text
  term_search_key            VARCHAR(255) binary NOT NULL, -- The term text, lowercase for case-insensitive lookups
  term_weight                FLOAT UNSIGNED     NOT NULL DEFAULT 0.0 -- weight for ranking
) /*$wgDBTableOptions*/;

-- Indexes and comments below adopted from the suggestions Sean Pringle made
-- at https://phabricator.wikimedia.org/T47529#518941 based on a
-- live analysis of queries on wikidata.org in January 2014.
-- NOTE: keep these in sync with UpdateTermIndexes.sql


-- Some wb_terms queries use term_full_entity_id_s=X which is good selectivity.
CREATE INDEX /*i*/term_full_entity ON /*_*/wb_terms (term_full_entity_id);

-- T204837
CREATE INDEX /*i*/wb_terms_text ON /*_*/wb_terms (term_text);

-- T204838
CREATE INDEX /*i*/wb_terms_search_key ON /*_*/wb_terms (term_search_key);

-- This index has good selectivity while still allowing ICP for short string values.
CREATE INDEX /*i*/term_search_full ON /*_*/wb_terms (term_language, term_full_entity_id, term_type, term_search_key(16));

-- T202265
CREATE INDEX /*i*/tmp1 ON /*_*/wb_terms (`term_language`, `term_type`, `term_entity_type`, `term_search_key`);

-- T204836
CREATE INDEX /*i*/wb_terms_entity_id ON /*_*/wb_terms (`term_entity_id`);

-- General info and links:
-- -----------------------
--
-- These tables represent a normalized form of the wb_terms table.
--
-- The wb_terms table history leading to this point can be all followed starting
-- from this epic https://phabricator.wikimedia.org/T208425
--
-- The current solution is part of that epic. work on it can be followed starting
-- from this ticket https://phabricator.wikimedia.org/T219175
--
-- This design is normalized for the sake of reducing duplication and redundancy
-- that accounted for ~70% of the disk space occupied by wb_terms table as of
-- March 2019. It requires few joins for queries, and impact on performance
-- will be addressed as the need arise.
--
-- Note on splitting Items and Properties:
-- ---------------------------------------
-- wb_item_terms an wb_property_terms are two tables with similar structure.
-- original draft had them combined in one table, but further discussion lead
-- to the decision of splitting them for the following reasons:
-- - items are lot more and will grow faster than properties, which means
--   when querying only for properties we need not to look at a gigantic
--   index that includes items as well, but rather a very small one.
-- - conceptually, items and properties are two different entity types, and
--   since different entity types might have different constraints and maybe
--   even structure, splitting them into corresponding tables saves us from
--   ending up with a wide polymorphic table again (lesson learned from
--   wb_terms table itself)
--
-- Note on Entity IDs:
-- ------------------
-- Since Wikidata entity ids are fixed and known to always be Q (like in Q123),
--  entity ids are stored as integers after dropping that prefix, in both
-- wb_item_terms and wb_property_terms tables.
-- e.g. entity id Q123 has integer id of 123.
--
-- when a new entity type gets introduced into the instance then one can either:
-- - store it in wb_item_terms/wb_property_items if it has the same prefix (Q), or
-- - create a separate table similar to wb_item_terms in structure for those
--   entities if the prefix is different.


-- stores a record per term per item per language. this table is expected to be the longest one
-- in this group of tables.
--
-- term text, type and language are normalized further through wb_term_in_lang
-- table.
CREATE TABLE IF NOT EXISTS /*_*/wbt_item_terms (
  wbit_id                                BIGINT unsigned       NOT NULL PRIMARY KEY AUTO_INCREMENT,
  wbit_item_id                           INT    unsigned       NOT NULL,
  wbit_term_in_lang_id                   INT    unsigned       NOT NULL
) /*$wgDBTableOptions*/;

CREATE INDEX /*i*/wbt_item_terms_item_id ON /*_*/wbt_item_terms (wbit_item_id);
CREATE UNIQUE INDEX /*i*/wbt_item_terms_term_in_lang_id_item_id ON /*_*/wbt_item_terms (wbit_term_in_lang_id, wbit_item_id);

-- stores a record per term per property per language.
--
-- term text, type and language are normalized further through wb_term_in_lang
-- table.
CREATE TABLE IF NOT EXISTS /*_*/wbt_property_terms (
  wbpt_id                                INT unsigned       NOT NULL PRIMARY KEY AUTO_INCREMENT,
  wbpt_property_id                       INT unsigned       NOT NULL,
  wbpt_term_in_lang_id                   INT unsigned       NOT NULL
) /*$wgDBTableOptions*/;

CREATE INDEX /*i*/wbt_property_terms_property_id ON /*_*/wbt_property_terms (wbpt_property_id);
CREATE UNIQUE INDEX /*i*/wbt_property_terms_term_in_lang_id_property_id ON /*_*/wbt_property_terms (wbpt_term_in_lang_id, wbpt_property_id);

-- stores a record per term per text per language.
--
-- term text and language are normalized further through wb_text_in_lang table.
CREATE TABLE IF NOT EXISTS /*_*/wbt_term_in_lang (
  wbtl_id                          INT unsigned       NOT NULL PRIMARY KEY AUTO_INCREMENT,
  wbtl_type_id                     INT unsigned       NOT NULL,
  wbtl_text_in_lang_id             INT unsigned       NOT NULL
) /*$wgDBTableOptions*/;

CREATE INDEX /*i*/wbt_term_in_lang_type_id_text_in ON /*_*/wbt_term_in_lang (wbtl_type_id);
CREATE UNIQUE INDEX /*i*/wbt_term_in_lang_text_in_lang_id_lang_id ON /*_*/wbt_term_in_lang (wbtl_text_in_lang_id, wbtl_type_id);

-- stores a record per term text per language.
--
-- text is normalized through wb_term_text table.
CREATE TABLE IF NOT EXISTS /*_*/wbt_text_in_lang (
  wbxl_id                         INT unsigned       NOT NULL PRIMARY KEY AUTO_INCREMENT,
  wbxl_language                   VARBINARY(20)      NOT NULL,
  wbxl_text_id                    INT unsigned       NOT NULL
) /*$wgDBTableOptions*/;

CREATE INDEX /*i*/wbt_text_in_lang_language ON /*_*/wbt_text_in_lang (wbxl_language);
CREATE UNIQUE INDEX /*i*/wbt_text_in_lang_text_id_text_id ON /*_*/wbt_text_in_lang (wbxl_text_id, wbxl_language);

-- stores a record per text value that are used in different terms
-- in different languages.
CREATE TABLE IF NOT EXISTS /*_*/wbt_text (
  wbx_id                             INT unsigned         NOT NULL PRIMARY KEY AUTO_INCREMENT,
  wbx_text                           VARBINARY(255)       NOT NULL
) /*$wgDBTableOptions*/;

CREATE UNIQUE INDEX /*i*/wbt_text_text ON /*_*/wbt_text (wbx_text);

-- normalized term type names
CREATE TABLE IF NOT EXISTS /*_*/wbt_type (
  wby_id                                INT unsigned         NOT NULL PRIMARY KEY AUTO_INCREMENT,
  wby_name                              VARBINARY(45)        NOT NULL
) /*$wgDBTableOptions*/;

CREATE UNIQUE INDEX /*i*/wbt_type_name ON /*_*/wbt_type (wby_name);