Extension:FileIndexer/Code.Pre.MediaWiki.1.11

From MediaWiki.org
Jump to: navigation, search
<?php
// Register ScanFileForIndex() as a handler for the 'UploadForm:BeforeProcessing' hook.
$wgHooks['UploadForm:BeforeProcessing'][] = 'ScanFileForIndex';
 
 
// Optional self-test: when $wgFileIndexerDebug is enabled, report every external
// converter that is missing from the location checked here.
// NOTE(review): ScanFileForIndex() runs pdftotext from /usr/local/bin while this
// check looks in /usr/bin -- confirm which path applies on the target system.
// !empty() has the same truthiness as "== true" but raises no notice when the
// configuration variable was never set.
if(!empty($wgFileIndexerDebug))
{
        isCommandPresent("/usr/bin/pdftotext");
        isCommandPresent("/usr/bin/iconv");
        isCommandPresent("/usr/bin/antiword");
        isCommandPresent("/usr/bin/xls2csv");
        isCommandPresent("/usr/bin/catppt");
        isCommandPresent("/usr/bin/strings");
}
 
/**
 * Hook handler: appends the text content of an uploaded document to its
 * upload description.
 *
 * The converter is chosen from the extension of $uploadFormObj->mDestFile and
 * is run on $uploadFormObj->mUploadTempName. Every output line is appended to
 * $uploadFormObj->mUploadDescription inside an HTML comment. When no converter
 * handles the extension, the description is left untouched.
 *
 * @param object $uploadFormObj Upload form object; mDestFile, mUploadTempName
 *                              and mUploadDescription are read, and
 *                              mUploadDescription may be rewritten.
 * @return object The same $uploadFormObj that was passed in.
 */
function ScanFileForIndex($uploadFormObj)
{
        $RemoveTags = false;  // strip markup from the converter output (XML based formats)?
        $toexec = '';         // stays empty when no converter handles the extension

        // Extension of the destination file name; strrchr() yields false when the name has no dot.
        $dotSuffix = strrchr($uploadFormObj->mDestFile, '.');
        $extension = ($dotSuffix === false) ? '' : strtolower(substr($dotSuffix, 1));

        // The temp path becomes part of a shell command line, so it has to be quoted.
        $tempFile = escapeshellarg($uploadFormObj->mUploadTempName);

        switch ($extension) // methods for text extraction
        {
                case "pdf":
                        // XPDF's pdftotext; its output is converted from ISO-8859-1 to UTF-8 by iconv
                        // Alternative location: /usr/bin/pdftotext
                        $toexec = "/usr/local/bin/pdftotext  -raw -nopgbrk " . $tempFile . " -";
                        $toexec .= "| iconv -f ISO-8859-1 -t UTF-8";
                        break;

                case "dot":
                case "doc":
                        // using antiword
                        $toexec = "/usr/bin/antiword -s " . $tempFile;
                        break;

                case "xls":
                        $toexec = "/usr/bin/xls2csv " . $tempFile;
                        break;

                case "ppt":
                        $toexec = "/usr/bin/catppt " . $tempFile;
                        break;

                case "rtf": // any file extension with text in it will be okay here
                        // the output of strings isn't neat, but it works
                        $toexec = "/usr/bin/strings " . $tempFile;
                        break;

                // OpenOffice.org documents: the text lives in content.xml inside the zip container
                case "ods":
                case "odp":
                case "odg":
                case "odt":
                        $toexec = "unzip -p " . $tempFile . " content.xml";
                        $RemoveTags = true;
                        break;
        }

        if ($toexec === '')
        {
                // Unhandled file type: nothing to index.
                return $uploadFormObj;
        }

        $DocText = array();
        exec($toexec, $DocText);

        $NewDesc = $uploadFormObj->mUploadDescription . "\r\n" . "<!-- ";
        foreach ($DocText as $DocLine)
        {
                // Tags are stripped first: removing them can itself bring a "-->" into existence.
                if ($RemoveTags)
                {
                        $DocLine = strip_tags($DocLine);
                }

                // A single pass is not enough ("--->->" collapses to "-->"), so repeat until
                // no comment terminator is left that could close the wrapper comment early.
                while (strpos($DocLine, '-->') !== false)
                {
                        $DocLine = str_replace('-->', '', $DocLine);
                }

                $NewDesc .= "\r\n" . $DocLine;
        }
        $NewDesc .= "\r\n" . " -->";
        $uploadFormObj->mUploadDescription = $NewDesc;

        return $uploadFormObj;
}
 
/**
 * Debug helper: reports an external command that is missing from its expected path.
 *
 * Prints nothing when something exists at $command. Otherwise the bare command
 * name is looked up with `whereis` and a diagnostic line containing the first
 * line of that output (empty if there was none) is echoed.
 *
 * @param string $command Path of the executable to check.
 * @return void
 */
function isCommandPresent($command)
{
        if (file_exists($command))
        {
                return;
        }

        // Extract the command name from the path. strrpos() returns false when there is
        // no slash and 0 when the only slash is the first character, so compare strictly.
        $lastSlash = strrpos($command, '/');
        if ($lastSlash !== false)
        {
                $commandWithoutSlashes = substr($command, $lastSlash + 1);
        }
        else
        {
                $commandWithoutSlashes = $command;
        }

        // Look the command up; the name is quoted because it ends up on a shell command line.
        $whereis = array();
        exec("whereis " . escapeshellarg($commandWithoutSlashes), $whereis);

        // whereis may be unavailable or print nothing, in which case there is no first line.
        $firstLine = isset($whereis[0]) ? $whereis[0] : '';
        echo "FileIndexer: The file $command is missing ... whereis result: $firstLine <br>";
}
 
/**
 * Add extension information to Special:Version.
 *
 * Appends one entry (name, authors, short description and home page URL) to
 * the 'other' group of $wgExtensionCredits.
 */
$wgExtensionCredits['other'][] = array(
        'name' => 'FileIndexer',
        'author' => 'MHart and Flominator',
        'description' => 'makes uploaded documents searchable',
        'url' => 'http://www.mediawiki.org/wiki/Extension:FileIndexer'
        );
Personal tools
Namespaces

Variants
Actions
Navigation
Support
Download
Development
Communication
Print/export
Toolbox