Extension:FileIndexer/Code.Pre.MediaWiki.1.11
From MediaWiki.org
<?php
/**
 * FileIndexer (variant for MediaWiki releases before 1.11).
 *
 * Registers ScanFileForIndex() on the 'UploadForm:BeforeProcessing' hook. For
 * supported document types the handler runs an external text extractor on the
 * uploaded temp file and appends the extracted text, wrapped in an HTML
 * comment, to the upload description.
 */

$wgHooks['UploadForm:BeforeProcessing'][] = 'ScanFileForIndex';

// With $wgFileIndexerDebug enabled, report every helper binary that is missing.
// empty() keeps this silent when the setting was never defined.
if (!empty($wgFileIndexerDebug)) {
    isCommandPresent("/usr/local/bin/pdftotext"); // the path the "pdf" case actually executes
    isCommandPresent("/usr/bin/iconv");
    isCommandPresent("/usr/bin/antiword");
    isCommandPresent("/usr/bin/xls2csv");
    isCommandPresent("/usr/bin/catppt");
    isCommandPresent("/usr/bin/strings");
}

/**
 * Hook handler: extracts the text of an uploaded document and appends it to
 * the upload description inside an HTML comment.
 *
 * The extractor is chosen from the extension of the destination file name.
 * Files with an unsupported extension leave the description untouched.
 *
 * @param object $uploadFormObj Upload form; mDestFile, mUploadTempName and
 *                              mUploadDescription are read, and
 *                              mUploadDescription is overwritten when text
 *                              was requested from an extractor.
 * @return object The same upload form object that was passed in.
 */
function ScanFileForIndex($uploadFormObj)
{
    $toexec = '';          // stays empty for unsupported extensions
    $RemoveTags = false;   // remove markup produced by the conversion?

    // Quote the temp path once so that blanks or shell metacharacters in it
    // can neither split the argument nor inject additional commands.
    $tempFile = escapeshellarg($uploadFormObj->mUploadTempName);

    // Extension of the destination file name ('' when there is none).
    $extension = strtolower(pathinfo($uploadFormObj->mDestFile, PATHINFO_EXTENSION));

    switch ($extension) { // methods for text extraction
        case "pdf":
            // using XPDF and iconv for conversion purposes
            // Alternative location: /usr/bin/pdftotext
            $toexec = "/usr/local/bin/pdftotext -raw -nopgbrk " . $tempFile . " -";
            $toexec .= "| iconv -f ISO-8859-1 -t UTF-8";
            break;

        case "dot":
        case "doc":
            // using antiword
            $toexec = "/usr/bin/antiword -s " . $tempFile;
            break;

        case "xls":
            $toexec = "/usr/bin/xls2csv " . $tempFile;
            break;

        case "ppt":
            $toexec = "/usr/bin/catppt " . $tempFile;
            break;

        case "rtf": // any file extension with text in it will be okay here
            // strings' output isn't neat, but it works.
            $toexec = "/usr/bin/strings " . $tempFile;
            break;

        // OpenOffice.org documents
        case "ods":
        case "odp":
        case "odg":
        case "odt":
            $toexec = "unzip -p " . $tempFile . " content.xml";
            $RemoveTags = true; // content.xml is XML; keep only its text
            break;
    }

    if ($toexec != "") {
        $DocText = array();
        exec($toexec, $DocText);

        $NewDesc = $uploadFormObj->mUploadDescription . "\r\n" . "<!-- ";
        foreach ($DocText as $DocLine) {
            $NewDesc .= "\r\n" . FileIndexerSanitizeLine($DocLine, $RemoveTags);
        }
        $NewDesc .= "\r\n" . " -->";

        $uploadFormObj->mUploadDescription = $NewDesc;
    }

    return $uploadFormObj;
}

/**
 * Makes one line of extracted text safe to embed inside an HTML comment.
 *
 * Tags are stripped first (when requested) and the comment terminator is then
 * removed repeatedly: a single pass would turn "---->>" back into "-->", and
 * stripping tags afterwards would turn "--<b>></b>" into "-->".
 *
 * @param string $line       Raw line produced by the extractor.
 * @param bool   $removeTags Whether markup should be stripped from the line.
 * @return string The line without any "-->" sequence.
 */
function FileIndexerSanitizeLine($line, $removeTags)
{
    if ($removeTags) {
        $line = strip_tags($line);
    }

    do {
        $line = str_replace("-->", "", $line, $removed);
    } while ($removed > 0);

    return $line;
}

/**
 * Debug helper: prints a notice when $command does not exist, together with
 * the first line that `whereis` reports for the bare command name.
 *
 * @param string $command Absolute path of the expected executable.
 * @return void
 */
function isCommandPresent($command)
{
    if (file_exists($command)) {
        return;
    }

    // extract the command from the path
    $commandWithoutSlashes = basename($command);

    // lookup the command
    $whereis = array();
    exec("whereis " . escapeshellarg($commandWithoutSlashes), $whereis);

    // whereis may print nothing at all; do not read a missing index.
    $whereisResult = isset($whereis[0]) ? $whereis[0] : '';

    echo "FileIndexer: The file $command is missing ... whereis result: $whereisResult <br>";
}

/**
 * Add extension information to Special:Version
 */
$wgExtensionCredits['other'][] = array(
    'name' => 'FileIndexer',
    'author' => 'MHart and Flominator',
    'description' => 'makes uploaded documents searchable',
    'url' => 'http://www.mediawiki.org/wiki/Extension:FileIndexer'
);
