Extension:RSS Reader/wikiRSS
From MediaWiki.org
wikiRSS is an RSS parser based on lastRSS. It is designed to work with the RSS Reader extension, and offer an alternative to file system caching (for users that might not have access to their file system, or would prefer to use the database).
Contents |
[edit] Benchmarks
In benchmarks, wikiRSS tends to be around 3 times slower than lastRSS at fetching cached feeds. Slight optimizations might make it run faster than it does now, but wikiRSS will never run faster than lastRSS. This follows from the simple fact that a database ultimately has to access files at the lowest level, while direct file caching avoids the overhead of the database.
[edit] Installation
Installing wikiRSS instead of lastRSS is not a big hassle. Copy and paste dbconfig.sql and wikiRSS.php into files with the same names in the folder where RSSReader.php is located. Open dbconfig.sql and change <your DBprefix> in the first line to your MediaWiki database prefix (as stored in $wgDBprefix). Then run dbconfig.sql against your MediaWiki database.
[edit] Code
[edit] wikiRSS.php
<?php
/*
 * wikiRSS 0.1 - PHP class to parse RSS files for MediaWiki
 * Copyright (C) 2007 Artem Kaznatcheev
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 2 of the License, or (at your
 * option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program. If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************
 * ACKNOWLEDGMENT
 *
 * This class is a modification of an existing RSS parser called: lastRSS
 * 0.9.1. lastRSS was created by Vojtech Semecky who can be contacted at
 * webmaster @ oslab . net. The original lastRSS "Simple yet powerful PHP
 * class to parse RSS files" was licensed under GPL, and this product
 * follows in its footsteps. Please visit http://lastrss.oslab.net/ for
 * more information about lastRSS
 **************************************************************************
 * WIKIRSS INFO
 *
 * wikiRSS provides a way to retrieve and parse RSS feeds. For retrieving
 * and parsing it mostly relies on the code that made lastRSS work, however
 * instead of caching feeds in individual files in a folder, it sticks
 * them in the database
 **************************************************************************
 */

# Not a valid entry point, skip unless MEDIAWIKI is defined
if (!defined('MEDIAWIKI')) {
    exit( 1 );
}

//uses MediaWiki database

/*
 * This is the wikiRSS class that does most of the fetching and caching of
 * RSS feeds. The majority of it comes from lastRSS by Vojtech Semecky.
 *
 * Typical use: create an instance, set the public properties below as
 * needed, then call Get($url).
 */
class wikiRSS {
    /*
     * Public properties
     */
    var $default_cp = 'UTF-8';   // code page assumed when the feed declares none
    var $CDATA = 'nochange';     // 'nochange', 'content' or 'strip' (see my_preg_match)
    var $cp = '';                // target code page; '' disables conversion
    var $items_limit = 0;        // maximum number of items to parse; 0 = no limit
    var $stripHTML = False;      // strip HTML from item title/description
    var $date_format = '';       // date() format for dates; '' leaves them untouched
    // The next two were read by Get() but never declared. $cache keeps its
    // effective default (off); $cache_time takes the lastRSS default.
    var $cache = false;          // true to cache parsed feeds in the rss_cache table
    var $cache_time = 3600;      // seconds a cached feed stays fresh

    /*
     * Private variables
     */
    var $rsscp = '';             // code page of the feed being parsed (set by Parse)
    var $channeltags = array (
        'title', 'link', 'description', 'language', 'copyright',
        'managingEditor', 'webMaster', 'lastBuildDate', 'rating', 'docs'
    );
    var $itemtags = array(
        'title', 'link', 'description', 'author', 'category', 'comments',
        'enclosure', 'guid', 'pubDate', 'source'
    );
    var $imagetags = array('title', 'url', 'link', 'width', 'height');
    var $textinputtags = array('title', 'description', 'name', 'link');

    /*
     * Fetch a feed (from the database cache when enabled and fresh) and
     * return it as an associative array.
     *
     * @param string $rss_url URL of the RSS feed
     * @return array|false parsed feed with an extra 'cached' key (1 when
     *         served from the cache, 0 when freshly fetched), or false if
     *         the feed could not be loaded
     */
    function Get ($rss_url) {
        if (!$this->cache) { //cache disabled
            $result = $this->Parse($rss_url);
            if ($result) $result['cached'] = 0;
            return $result;
        }

        // The query wrappers add the table prefix and quote every value,
        // so neither the URL nor the serialized feed can break the SQL.
        $dbr = wfGetDB( DB_SLAVE );
        $row = $dbr->selectRow(
            'rss_cache',
            array( 'rss_content', 'rss_cacheTime' ),
            array( 'rss_source' => $rss_url ),
            'wikiRSS::Get'
        );
        // NOTE: the connection is shared with the rest of MediaWiki and
        // must not be closed here.

        if ($row && (time() - $row->rss_cacheTime) < $this->cache_time) {
            // Entry is fresh enough. The content was produced by
            // serialize() in this method; a failed fetch is cached as
            // false and is returned as such until it expires.
            $result = unserialize($row->rss_content);
            if ($result) $result['cached'] = 1; // only if cached data is usable
            return $result;
        }

        // No entry yet, or the entry is too old: fetch and (re)store it
        $result = $this->Parse($rss_url);
        if ($result) $result['cached'] = 0;

        // replace() covers both the insert and the update case and cannot
        // fail with a duplicate key when two requests fetch the same feed.
        $dbw = wfGetDB( DB_MASTER );
        $dbw->replace(
            'rss_cache',
            array( 'rss_source' ),
            array(
                'rss_source'    => $rss_url,
                'rss_content'   => serialize($result),
                'rss_cacheTime' => time()
            ),
            'wikiRSS::Get'
        );

        return $result;
    }

    /*
     * Modification of preg_match(): returns the trimmed sub-match with
     * index 1, after optional CDATA handling and character set conversion.
     *
     * @param string $pattern regular expression with at least one group
     * @param string $subject text to search
     * @return string the processed match, or '' when nothing matched
     */
    function my_preg_match ($pattern, $subject) {
        preg_match($pattern, $subject, $out);
        if (!isset($out[1])) {
            return ''; // no result
        }
        $value = $out[1];

        // Process CDATA (if present). As in lastRSS, 'content' and 'strip'
        // currently behave the same: the CDATA markers are removed and the
        // enclosed text is kept.
        if ($this->CDATA == 'content' || $this->CDATA == 'strip') {
            $value = strtr($value, array('<![CDATA['=>'', ']]>'=>''));
        }

        // If a code page is set, convert from the feed's code page to it
        if ($this->cp != '') {
            $value = iconv($this->rsscp, $this->cp.'//TRANSLIT', $value);
            if ($value === false) {
                return ''; // conversion failed
            }
        }

        return trim($value);
    }

    /*
     * Replace HTML entities &something; by real characters
     *
     * @param string $string text containing HTML entities
     * @return string text with the entities decoded
     */
    function unhtmlentities ($string) {
        // Get the HTML entities table and flip it to entity => character
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_QUOTES));
        // Add support for the &apos; entity (missing in HTML_ENTITIES)
        $trans_tbl += array('&apos;' => "'");
        // Replace entities by values
        return strtr($string, $trans_tbl);
    }

    /*
     * Convert a date string to $this->date_format.
     *
     * @param string $date date as found in the feed
     * @return string the reformatted date, or $date unchanged when it
     *         cannot be parsed
     */
    function convertDate ($date) {
        $timestamp = strtotime($date);
        // strtotime() reports failure as false (or -1 before PHP 5.1)
        if ($timestamp === false || $timestamp === -1) {
            return $date;
        }
        return date($this->date_format, $timestamp);
    }

    /*
     * Parse() is a private method used by Get() to load and parse an RSS
     * file. Don't use Parse() in your scripts - use Get($rss_file) instead.
     *
     * @param string $rss_url URL of the RSS feed
     * @return array|false parsed feed, or false if it could not be opened
     */
    function Parse ($rss_url) {
        // Load the RSS file; a failure is reported through the return value
        $rss_content = @file_get_contents($rss_url);
        if ($rss_content === false) {
            return false;
        }

        $result = array();

        // Parse document encoding. Plain preg_match() is used because
        // my_preg_match() converts from $this->rsscp, which is only known
        // once the encoding has been read.
        $encoding = '';
        if (preg_match("'encoding=[\'\"](.*?)[\'\"]'si", $rss_content, $out_encoding)) {
            $encoding = trim($out_encoding[1]);
        }
        $result['encoding'] = $encoding;
        // Use the document code page if specified, the default otherwise
        $this->rsscp = ($encoding != '') ? $encoding : $this->default_cp;

        // Parse CHANNEL info
        $channel = '';
        if (preg_match("'<channel.*?>(.*?)</channel>'si", $rss_content, $out_channel)) {
            $channel = $out_channel[1];
        }
        foreach ($this->channeltags as $channeltag) {
            $temp = $this->my_preg_match("'<$channeltag.*?>(.*?)</$channeltag>'si", $channel);
            if ($temp != '') $result[$channeltag] = $temp; // Set only if not empty
        }
        // If date_format is specified, convert lastBuildDate when present
        if ($this->date_format != '' && isset($result['lastBuildDate'])) {
            $result['lastBuildDate'] = $this->convertDate($result['lastBuildDate']);
        }

        // Parse TEXTINPUT info
        /* This slightly strange regexp means:
         * look for a <textinput> tag with or without attributes, but skip
         * the self-closing form <textinput /> (it is not an opening tag)
         */
        preg_match("'<textinput(|[^>]*[^/])>(.*?)</textinput>'si", $rss_content, $out_textinfo);
        if (isset($out_textinfo[2])) {
            foreach ($this->textinputtags as $textinputtag) {
                $temp = $this->my_preg_match("'<$textinputtag.*?>(.*?)</$textinputtag>'si", $out_textinfo[2]);
                if ($temp != '') $result['textinput_'.$textinputtag] = $temp; // Set only if not empty
            }
        }

        // Parse IMAGE info
        preg_match("'<image.*?>(.*?)</image>'si", $rss_content, $out_imageinfo);
        if (isset($out_imageinfo[1])) {
            foreach ($this->imagetags as $imagetag) {
                $temp = $this->my_preg_match("'<$imagetag.*?>(.*?)</$imagetag>'si", $out_imageinfo[1]);
                if ($temp != '') $result['image_'.$imagetag] = $temp; // Set only if not empty
            }
        }

        // Parse ITEMS
        preg_match_all("'<item(| .*?)>(.*?)</item>'si", $rss_content, $items);
        $i = 0;
        $result['items'] = array(); // create array even if there are no items
        foreach ($items[2] as $rss_item) {
            // Stop once the limit is reached (0 means no limit)
            if ($this->items_limit != 0 && $i >= $this->items_limit) {
                break;
            }

            $item = array();
            foreach ($this->itemtags as $itemtag) {
                $temp = $this->my_preg_match("'<$itemtag.*?>(.*?)</$itemtag>'si", $rss_item);
                if ($temp != '') $item[$itemtag] = $temp; // Set only if not empty
            }

            if ($this->stripHTML) {
                // Strip HTML tags and entities from DESCRIPTION and TITLE
                foreach (array('description', 'title') as $field) {
                    if (!empty($item[$field])) {
                        $item[$field] = strip_tags($this->unhtmlentities(strip_tags($item[$field])));
                    }
                }
            }

            // If date_format is specified, convert pubDate when present
            if ($this->date_format != '' && isset($item['pubDate'])) {
                $item['pubDate'] = $this->convertDate($item['pubDate']);
            }

            // Always store the item so that items_count matches the array
            $result['items'][$i] = $item;
            $i++;
        }
        $result['items_count'] = $i;

        return $result;
    }
}
[edit] dbConfig.sql
Make sure to replace <your DBprefix> in the first line with your MediaWiki's database prefix (as stored by $wgDBprefix)
-- Cache table used by wikiRSS: one row per feed URL.
CREATE TABLE IF NOT EXISTS <your DBprefix>rss_cache (
    -- URL of the feed; this is the cache key
    rss_source varchar(255) NOT NULL,
    -- PHP serialize()d parse result. A plain blob holds at most 64 KB, which
    -- a large feed exceeds; the truncated value then fails to unserialize.
    rss_content mediumblob,
    -- Unix timestamp of the moment the feed was last fetched
    rss_cacheTime int(10) UNSIGNED NOT NULL,
    PRIMARY KEY (rss_source)
);
[edit] Future Improvements
- Try to make the script run faster