User:Isarra/grabFiles.php

From mediawiki.org
<?php
/**
 * Grabs files from a pre-existing wiki into a new wiki.
 * TODO: merge this back into grabImages (or a similar script) later.
 *
 * @file
 * @ingroup Maintenance
 * @author Calimonious the Estrange
 * @date 31 December 2012
 * @note Based on code by Misza, Jack Phoenix and Edward Chernenko.
 */

# This script is not located in the maintenance directory, so aim the include
# path at ../maintenance before loading the two files below by bare name.
ini_set( 'include_path', dirname( __FILE__ ) . '/../maintenance' );

require_once 'Maintenance.php';
require_once 'mediawikibot.class.php';

/**
 * Maintenance script that walks the allimages list of a remote wiki through
 * its api.php, inserts image/oldimage rows for every file revision and saves
 * the file contents below $wgUploadDirectory.
 */
class GrabFiles extends Maintenance {
	/**
	 * Registers the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Grabs files from a pre-existing wiki into a new wiki. Assumes a normal file hashing structure.';
		$this->addOption( 'url', 'URL to the target wiki\'s api.php', true /* required? */, true /* withArg */, 'u' );
		$this->addOption( 'username', 'Username to log into the target wiki', false, true, 'n' );
		$this->addOption( 'password', 'Password on the target wiki', false, true, 'p' );
		$this->addOption( 'db', 'Database name, if we don\'t want to write to $wgDBname', false, true );
		$this->addOption( 'from', 'Name of file to start from', false, true );
		$this->addOption( 'enddate', 'Date after which to ignore new files (20121222142317, 2012-12-22T14:23:17Z, etc)', false, true );
	}

	/**
	 * Entry point: resolves the options, sets up the API bot and pages through
	 * the remote allimages list, handing every page to processFile().
	 */
	public function execute() {
		# $endDate is handed to processFile() through the global scope.
		global $endDate;

		$endDate = $this->getOption( 'enddate' );
		if ( $endDate ) {
			$endDate = wfTimestamp( TS_MW, $endDate );
			if ( !$endDate ) {
				$this->error( "Invalid enddate format.\n", true );
			}
		} else {
			$endDate = wfTimestampNow();
		}

		$url = $this->getOption( 'url' );
		if ( !$url ) {
			$this->error( 'The URL to the target wiki\'s api.php is required.', true );
		}
		$user = $this->getOption( 'username' );
		$password = $this->getOption( 'password' );

		$this->output( "Working...\n" );

		# bot class and log in if requested; without a complete username and
		# password pair the bot is created with empty credentials.
		$loginRequested = $user && $password;
		$bot = new MediaWikiBot(
			$url,
			'json',
			$loginRequested ? $user : '',
			$loginRequested ? $password : '',
			'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:13.0) Gecko/20100101 Firefox/13.0.1'
		);
		if ( $loginRequested ) {
			# NOTE(review): a falsy return value from login() is treated as
			# success, exactly as before — confirm against mediawikibot.class.php.
			if ( !$bot->login() ) {
				$this->output( "Logged in as $user...\n" );
			} else {
				$this->output( "WARNING: Failed to log in as $user.\n" );
			}
		}

		$params = array(
			'generator' => 'allimages',
			'gailimit' => 500,
			'prop' => 'imageinfo',
			'iiprop' => 'timestamp|user|userid|comment|url|size|sha1|mime|metadata|archivename|bitdepth|mediatype',
			'iilimit' => 500
		);

		# The registered option is called 'from'; it seeds the gaifrom parameter.
		$gaifrom = $this->getOption( 'from' );
		$count = 0;

		$this->output( "Processing and downloading  files...\n" );
		do {
			if ( $gaifrom === null ) {
				unset( $params['gaifrom'] );
			} else {
				$params['gaifrom'] = $gaifrom;
			}
			$result = $bot->query( $params );
			if ( empty( $result['query']['pages'] ) ) {
				$this->error( 'No files found...', true );
			}

			foreach ( $result['query']['pages'] as $file ) {
				$count += $this->processFile( $file );
			}

			# Only follow the allimages continuation; a 'query-continue' block
			# without it must not be indexed blindly.
			if ( isset( $result['query-continue']['allimages']['gaifrom'] ) ) {
				$gaifrom = $result['query-continue']['allimages']['gaifrom'];
			} else {
				$gaifrom = null;
			}
		} while ( $gaifrom !== null );
		$this->output( "$count files downloaded.\n" );
	}

	/**
	 * Imports one file page: inserts a database row for each revision and
	 * downloads the revision's contents into the local upload directory.
	 *
	 * @param array $entry One element of the API result's query.pages list,
	 *  read here for its 'title' and 'imageinfo' keys.
	 * @return int Number of revisions that were downloaded and written.
	 */
	public function processFile( $entry ) {
		global $wgDBname, $wgUploadDirectory, $endDate;

		# Drop everything up to the first colon and use underscores for spaces.
		$name = $entry['title'];
		$name = preg_replace( '/^[^:]*?:/', '', $name );
		$name = str_replace( ' ', '_', $name );

		# Check if file already exists.
		$file = wfFindFile( $name );
		if ( is_object( $file ) ) {
			return 0;
		}

		# Nothing to do when the entry carries no revision information.
		if ( empty( $entry['imageinfo'] ) ) {
			return 0;
		}

		$this->output( "Processing {$entry['title']}: " );
		$count = 0;
		$nameLength = strlen( $name );
		$dbName = $this->getOption( 'db', $wgDBname );

		foreach ( $entry['imageinfo'] as $fileVersion ) {
			# Applied until a revision has been saved: a revision newer than
			# the end date makes us skip the whole file.
			if ( !$count && $endDate < wfTimestamp( TS_MW, $fileVersion['timestamp'] ) ) {
				$this->output( "newer than enddate, skipped\n" );
				return 0;
			}
			$fileurl = $fileVersion['url'];
			# The local path is cut from the end of the URL by the byte length
			# of $name, so percent-encoded characters have to be decoded first
			# or the offsets drift. The download still uses the original URL.
			$decodedUrl = rawurldecode( $fileurl );

			if ( isset( $fileVersion['archivename'] ) ) {
				# Old version
				$e = array(
					'oi_name' => $name,
					'oi_archive_name' => $fileVersion['archivename'],
					'oi_size' => $fileVersion['size'],
					'oi_width' => $fileVersion['width'],
					'oi_height' => $fileVersion['height'],
					'oi_bits' => $fileVersion['bitdepth'],
					'oi_description' => $fileVersion['comment'],
					'oi_user' => $fileVersion['userid'],
					'oi_user_text' => $fileVersion['user'],
					'oi_timestamp' => wfTimestamp( TS_MW, $fileVersion['timestamp'] ),
					'oi_media_type' => $fileVersion['mediatype'],
					'oi_deleted' => 0,
					'oi_sha1' => $fileVersion['sha1'],
					'oi_metadata' => serialize( $fileVersion['metadata'] )
				);

				list( $e['oi_major_mime'], $e['oi_minor_mime'] ) = $this->splitMime( $fileVersion['mime'] );

				$dbw = wfGetDB( DB_MASTER, array(), $dbName );

				$dbw->insert( 'oldimage', $e, __METHOD__ );
				$dbw->commit();

				# Tail layout: "/x/xy/" (6 bytes) + 14-digit timestamp + "!" + name.
				$urlTail = substr( $decodedUrl, -1 * ( $nameLength + 21 ) );
				$fileLocalPath = $wgUploadDirectory . "/archive" . $urlTail;
				$fileLocalDir = $wgUploadDirectory . "/archive" . substr( $urlTail, 0, 6 );
			} else {
				# Current version
				# Check if title is present in database because someone screwed up
				$dbr = wfGetDB( DB_SLAVE, array(), $dbName );
				$result = $dbr->select(
					'image',
					'img_name',
					array( 'img_name' => $name ),
					__METHOD__
				);
				if ( !$dbr->fetchObject( $result ) ) {
					$e = array(
						'img_name' => $name,
						'img_size' => $fileVersion['size'],
						'img_width' => $fileVersion['width'],
						'img_height' => $fileVersion['height'],
						'img_metadata' => serialize( $fileVersion['metadata'] ),
						'img_bits' => $fileVersion['bitdepth'],
						'img_media_type' => $fileVersion['mediatype'],
						'img_description' => $fileVersion['comment'],
						'img_user' => $fileVersion['userid'],
						'img_user_text' => $fileVersion['user'],
						'img_timestamp' => wfTimestamp( TS_MW, $fileVersion['timestamp'] ),
						'img_sha1' => $fileVersion['sha1']
					);

					list( $e['img_major_mime'], $e['img_minor_mime'] ) = $this->splitMime( $fileVersion['mime'] );

					$dbw = wfGetDB( DB_MASTER, array(), $dbName );

					$dbw->insert( 'image', $e, __METHOD__ );
					$dbw->commit();
				}

				# Tail layout: "/x/xy/" (6 bytes) + name.
				$urlTail = substr( $decodedUrl, -1 * ( $nameLength + 6 ) );
				$fileLocalPath = $wgUploadDirectory . $urlTail;
				$fileLocalDir = $wgUploadDirectory . substr( $urlTail, 0, 6 );
			}

			wfSuppressWarnings();
			$fileContent = file_get_contents( $fileurl );
			wfRestoreWarnings();
			if ( $fileContent === false || $fileContent === '' ) {
				$this->output( "$fileurl not found on remote server.\n" );
				continue;
			}

			# Directory structure and save
			if ( !file_exists( $fileLocalDir ) ) {
				mkdir( $fileLocalDir, 0777, true );
			}
			# Only count the revision when it actually reached the disk.
			if ( file_put_contents( $fileLocalPath, $fileContent ) === false ) {
				$this->output( "Could not write {$fileLocalPath}.\n" );
				continue;
			}

			$count++;
		}
		if ( $count == 1 ) {
			$this->output( "1 revision\n" );
		} else {
			$this->output( "$count revisions\n" );
		}

		return $count;
	}

	/**
	 * Splits a MIME type string at its first slash.
	 *
	 * @param string $mime MIME type such as "image/png"
	 * @return array Two elements: the part before the slash, the part after it
	 */
	private function splitMime( $mime ) {
		$mimeBreak = strpos( $mime, '/' );

		return array(
			substr( $mime, 0, $mimeBreak ),
			substr( $mime, $mimeBreak + 1 )
		);
	}
}

# Name the class to run, then load whatever RUN_MAINTENANCE_IF_MAIN points at
# (the constant is defined outside this file).
$maintClass = 'GrabFiles';
require_once RUN_MAINTENANCE_IF_MAIN;