Toolserver:Python/MySQL bulk load

From mediawiki.org
Jump to navigation Jump to search
Wikimedia Community Logo-Toolserver.svg

This page was moved from the Toolserver wiki.
Toolserver has been replaced by Toolforge. As such, the instructions here may no longer work, but may still be of historical interest.
Please help by updating examples, links, template links, etc. If a page is still relevant, move it to a normal title and leave a redirect.

# Google Maps link extractor
# This demonstrates LOAD DATA LOCAL INFILE in an ETL operations in python using 
# oursql in conjunction with a Minimum bounding rectangle spatial (MBR) index.
# 
# Dispenser 2011, Public domain
import os, re, oursql

mysql_byte_escape = dict((chr(i), chr(i)) for i in range(256))
mysql_byte_escape.update({
    # Bytes escape
    b'\0':   b'\\0',
    b'\b':   b'\\b',
    b'\n':   b'\\n',
    b'\r':   b'\\r',
    b'\t':   b'\\t',
    b'\x16': b'\\Z',
    b'\\':   b'\\\\',
})
def MySQL_format(s, encoding='utf-8'):
    if s is None:
        return b'\\N'
    elif isinstance(s, bytes):
        return b''.join(map(mysql_byte_escape.__getitem__, s))
    elif isinstance(s, unicode):
        return b''.join(map(mysql_byte_escape.__getitem__, s.encode(encoding)))
    elif isinstance(s, bool):
        return b'1' if s else b'0'
    elif isinstance(s, (int, long)):
        return bytes(s)
    elif isinstance(s, float):
        # repr() does not round unlike str()
        return repr(s)
    else:
        raise TypeError

def main():
    # Database_name.target_table_name
    spatial_table = "u_dispenser_p.gmap_coordinates"
    # Temporary file, can also be imported into Excel
    filename='./gmaps_extract.tsv'
    wiki = 'enwiki_p'
    conn = oursql.connect(
        db=wiki,
        host=wiki.replace('_', '-')+'.userdb.toolserver.org',
        read_default_file=os.path.expanduser("~/.my.cnf"),
        local_infile=True # IMPORTANT
    )
    # Extract
    cursor = conn.cursor()
    cursor.execute("SELECT el_from, el_to FROM externallinks WHERE el_to LIKE ?", ("http://maps.google.com/%",))
    
    # Transform
    with open(filename, 'wb') as f:
        query_location_R = re.compile(r'([&?]q=([^&]*)|).*?[&?]s?ll=([0-9.+-]+),([0-9.+-]+)')
        for el_from, el_to in cursor:
            m = query_location_R.search(el_to)
            if not m:continue
            params = (el_from, float(m.group(3)), float(m.group(4)), m.group(2) and m.group(2)[:255],)
            # data is escaped using our MySQL_format so tabs can be used to separate columns and 
            # newline separate rows.  Some special characters are also escaped.
            f.write(b'\t'.join((MySQL_format(s) for s in params)) )
            f.write(b'\n')
    
    # Load
    cursor.execute("""
CREATE TABLE """+spatial_table+""" (
    gm_from  INT(10) UNSIGNED NOT NULL,
    gm_lat   DOUBLE NOT NULL,
    gm_lng   DOUBLE NOT NULL,
    gm_query VARBINARY(255) NULL,
    gm_point POINT NOT NULL,
    KEY gm_from (gm_from),
    SPATIAL KEY gm_point (gm_point)
) ENGINE = MyISAM;
""")
    cursor.execute("""
LOAD DATA LOCAL INFILE '"""+filename+"""'
  INTO TABLE """+spatial_table+"""
/* Here it the same as the default.  See manual more formats like CSV */
  FIELDS TERMINATED BY '\\t' 
  LINES TERMINATED BY '\\n' 
  (gm_from,gm_lat,gm_lng,gm_query) 
/* MySQL's internal geometery is different from WkT and WkB, so we create it on the fly */
  SET gm_point = GeomFromText(CONCAT('POINT(',gm_lat,' ',gm_lng,')'))
""")

if __name__ == "__main__":
    main()

Category:Code