Wikimedia Security Team/SVG filter changes

From mediawiki.org

If you're changing the SVG filter, sometimes you want to make sure you didn't accidentally blacklist something that's used everywhere.

Here is a script you can use to check recent uploads. It requires a separate, unrelated MW install and a connection to a DB containing the image table (I used the one at tool labs for this). It's a tad hacky, but hey, it works.

<?php
/**
 * Maintenance script to check if recent uploads still pass filters.
 *
 * This is meant to check if changes to filter will cause problems.
 * The idea being that one would run this script on a modified version
 * of mediawiki with the filter changes, on somewhere like tool labs,
 * and it will download recent uploads from commons and verify they'd still
 * work.
 */
require_once __DIR__ . "/Maintenance.php";

/**
 * Re-runs the SVG upload filter over files listed in a foreign `image` table.
 *
 * Files are walked newest-first in batches; each one is downloaded over HTTP
 * and passed to UploadDummy::checkFile(). Every file that no longer passes is
 * reported on stdout together with the error code returned by the filter.
 */
class CheckSVGs extends Maintenance {
	/** @var string|null img_timestamp of the last row of the previous batch (paging cursor) */
	private $lastFileDate;
	/** @var string|null img_name of the last row of the previous batch (paging tie-breaker) */
	private $lastFileName;
	/** @var UploadDummy Object exposing the SVG filter */
	private $upload;
	/** @var mixed Handle returned by Database::factory() for the foreign wiki's database */
	private $foreignDB;

	public function __construct() {
		parent::__construct();
		$this->addDescription( "Check old svgs to see if they are still valid" );
		$this->addOption( 'until', 'Date to stop checking', false, true );
		$this->addOption( 'basepath', 'Path for remote files', false, true );
		$this->addOption( 'fdbserver', 'Foreign DB server', true, true );
		$this->addOption( 'fdbuser', 'Foreign DB user', true, true );
		$this->addOption( 'fdbpassword', 'Foreign DB pass', true, true );
		$this->addOption( 'fdbname', 'Foreign db name', true, true );
		$this->addOption( 'fdbtype', 'Foreign db type (default mysql)', false, true );
		$this->addOption( 'max-size', 'Max size to check (in bytes)', false, true );
		$this->setBatchSize( 500 );
	}

	/**
	 * Open a connection to the foreign database described by the --fdb* options.
	 *
	 * This is meant to connect to a tool labs db. Prints an error and exits
	 * with status 1 if no connection object could be created.
	 *
	 * @return mixed Whatever Database::factory() returns for the requested type
	 */
	private function getDBConnection() {
		$type = $this->getOption( 'fdbtype', 'mysql' );

		$params = [
			'host' => $this->getOption( 'fdbserver' ),
			'user' => $this->getOption( 'fdbuser' ),
			// FIXME, really shouldn't be passed on command line.
			'password' => $this->getOption( 'fdbpassword' ),
			'dbname' => $this->getOption( 'fdbname' ),
			'foreign' => true,
		];
		$db = Database::factory( $type, $params );
		if ( !$db ) {
			$this->error( "Could not get db" );
			exit( 1 );
		}
		return $db;
	}

	/**
	 * Entry point: check every candidate file, batch by batch, and print a summary.
	 */
	public function execute() {
		// Use the declared property; the previous code wrote to an undeclared
		// $this->db, which is a deprecated dynamic property on PHP >= 8.2.
		$this->foreignDB = $this->getDBConnection();
		$this->upload = new UploadDummy;
		$tot = 0;
		$bad = 0;
		// getCandidates() returns an empty (falsy) array once nothing is left.
		while ( $candidates = $this->getCandidates() ) {
			foreach ( $candidates as $candidate ) {
				$file = $this->getFile( $candidate );
				$res = $this->checkFile( $file );
				// Drop the reference to the file contents before fetching the next one.
				$file = '';
				if ( $res !== true ) {
					$bad++;
					echo "Error ($res): $candidate\n";
				}
				$tot++;
			}
			echo "\tDone batch ($tot) - at: " . $candidates[count( $candidates ) - 1] . "\n";
		}
		echo "Complete: $tot total; $bad bad\n";
	}

	/**
	 * Run the SVG filter over the given file contents.
	 *
	 * @param string $file Full contents of the SVG file
	 * @return bool|string True if the file passes, otherwise an error code
	 */
	private function checkFile( $file ) {
		return $this->upload->checkFile( $file );
	}

	/**
	 * Download a file by name.
	 *
	 * @param string $filename Name as stored in img_name
	 * @return string Contents of the file
	 * @throws Exception If Http::get() reports failure
	 */
	private function getFile( $filename ) {
		$url = $this->getUrlForFilename( $filename );
		$svg = Http::get( $url, [ 'userAgent' => 'SVG validation script. https://www.mediawiki.org/w/index.php?title=Wikimedia_Security_Team/SVG_filter_changes' ] );
		if ( $svg === false ) {
			throw new Exception( "Could not download file - $url" );
		}
		return $svg;
	}

	/**
	 * Build the download URL for a file name: base path, then the first one
	 * and first two hex digits of the name's md5 as directories, then the
	 * URL-encoded name.
	 *
	 * @param string $filename Name as stored in img_name
	 * @return string
	 */
	private function getUrlForFilename( $filename ) {
		// e.g. https://upload.wikimedia.org/wikipedia/commons/
		$basepath = $this->getOption( 'basepath', 'https://upload.wikimedia.org/wikipedia/commons/' );
		$md5 = md5( $filename );
		$basepath .= substr( $md5, 0, 1 ) . '/' . substr( $md5, 0, 2 ) . '/';
		$basepath .= rawurlencode( $filename );
		return $basepath;
	}

	/**
	 * Fetch the next batch of SVG file names, newest first.
	 *
	 * Remembers the timestamp and name of the last row returned so that the
	 * following call continues right after it.
	 *
	 * @return string[] File names; empty when there is nothing left to check
	 */
	private function getCandidates() {
		$db = $this->foreignDB;
		$conds = [
			'img_size < ' . ( (int)$this->getOption( 'max-size', 1024 * 1024 * 10 ) ),
			'img_major_mime' => 'image',
			'img_minor_mime' => 'svg+xml',
			'img_media_type' => 'DRAWING'
		];
		if ( $this->lastFileDate !== null ) {
			// Page on (timestamp, name) so that files sharing the timestamp of
			// the previous batch's last row are not skipped. This mirrors the
			// ORDER BY below: timestamp descending, then name ascending.
			$lastDate = $db->addQuotes( $this->lastFileDate );
			$lastName = $db->addQuotes( $this->lastFileName );
			$conds[] = "( img_timestamp < $lastDate"
				. " OR ( img_timestamp = $lastDate AND img_name > $lastName ) )";
		}

		$until = $this->getOption( 'until' );
		if ( $until ) {
			$conds[] =
				'img_timestamp > ' . $db->addQuotes( $until );
		}

		$res = $db->select(
			'image',
			[ 'img_name', 'img_timestamp' ],
			$conds,
			__METHOD__,
			[
				'ORDER BY' => 'img_timestamp desc, img_name asc',
				// getOption( 'batch-size' ) is null unless --batch-size was
				// given, which left the query without a LIMIT. mBatchSize holds
				// the value set via setBatchSize() or the command line.
				'LIMIT' => (int)$this->mBatchSize
			]
		);

		$actualResults = [];
		foreach ( $res as $row ) {
			$actualResults[] = $row->img_name;
			$this->lastFileName = $row->img_name;
			$this->lastFileDate = $row->img_timestamp;
		}
		return $actualResults;
	}
}

// Make sure UploadBase can be found.
require_once __DIR__ . "/../includes/AutoLoader.php";

/**
 * Thin UploadBase subclass whose only job is to run the SVG script filter
 * on a string of file contents.
 */
class UploadDummy extends UploadBase {
	/**
	 * Intentionally a no-op; this class is only used through checkFile().
	 *
	 * @param mixed &$request Ignored
	 */
	public function initializeFromRequest( &$request ) {
	}

	/**
	 * Run the SVG filter callbacks over the given markup.
	 *
	 * @param string $file Full contents of the SVG file
	 *
	 * @return bool|string True if ok, or string for error code
	 */
	public function checkFile( $file ) {
		// Clear any namespace error left over from a previously checked file.
		$this->mSVGNSError = false;

		$elementCallback = [ $this, 'checkSvgScriptCallback' ];
		$extraOptions = [ 'processing_instruction_handler' => 'UploadBase::checkSvgPICallback' ];
		// Third argument false: $file is the document itself, not a file name.
		$xmlCheck = new XmlTypeCheck( $file, $elementCallback, false, $extraOptions );

		if ( $xmlCheck->wellFormed !== true ) {
			return 'uploadinvalidxml';
		}

		if ( !$xmlCheck->filterMatch ) {
			return true;
		}

		// A recorded namespace error takes precedence over the generic match type.
		if ( $this->mSVGNSError ) {
			return $this->mSVGNSError;
		}

		return $xmlCheck->filterMatchType;
	}
}

// Name of the maintenance class defined in this file that should be run.
$maintClass = "CheckSVGs";
// RUN_MAINTENANCE_IF_MAIN is not defined in this file; presumably it comes
// from the Maintenance.php required at the top and runs $maintClass when this
// script is the entry point - TODO confirm.
require_once RUN_MAINTENANCE_IF_MAIN;