r43920 - Code Review

From MediaWiki.org

Jump to: navigation, search
Repository:MediaWiki
Revision:r43919 | r43920 (on ViewVC) | r43921 >
Date:02:39, 25 November 2008
Author:brion
Status:ok (Comments)
Tags:
Comment:* (bug 5477) Searches for words less than 4 characters now work without
requiring customization of MySQL server settings

Short words are padded so they now get indexed. Yay!

Adapted part of Werdna's patch, with some additional cleanup:
* Using 'U800' to pad instead of 'SMALL' to reduce false positives (e.g. a search for "small*" could match "Smallville" and "SMALLc")
* Checking server's ft_min_word_len variable to see if we need to do anything. This preserves index compatibility with existing installations which have customized their index length.
* Some further cleanup on redundant code -- just toss everything through lc() and be done with it :D
* Cleaned out some more evals in zh and yue classes :P
* Fixed yue class to call the parent adjustor properly
Modified paths:

Diff [purge]

Index: trunk/phase3/languages/Language.php
===================================================================
--- trunk/phase3/languages/Language.php	(revision 43919)
+++ trunk/phase3/languages/Language.php	(revision 43920)
@@ -1523,25 +1523,61 @@
 			return $string;
 		}
 
-		# MySQL fulltext index doesn't grok utf-8, so we
-		# need to fold cases and convert to hex
 
 		wfProfileIn( __METHOD__ );
-		if( function_exists( 'mb_strtolower' ) ) {
+		
+		// MySQL fulltext index doesn't grok utf-8, so we
+		// need to fold cases and convert to hex
+		$out = preg_replace_callback(
+			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+			array( $this, 'stripForSearchCallback' ),
+			$this->lc( $string ) );
+		
+		// And to add insult to injury, the default indexing
+		// ignores short words... Pad them so we can pass them
+		// through without reconfiguring the server...
+		$minLength = $this->minSearchLength();
+		if( $minLength > 1 ) {
+			$n = $minLength-1;
 			$out = preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"'U8' . bin2hex( \"$1\" )",
-				mb_strtolower( $string ) );
-		} else {
-			list( , $wikiLowerChars ) = self::getCaseMaps();
-			$out = preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-				$string );
+				"/\b(\w{1,$n})\b/",
+				"$1U800",
+				$out );
 		}
+		
 		wfProfileOut( __METHOD__ );
 		return $out;
 	}
+	
+	/**
+	 * Armor a case-folded UTF-8 string to get through MySQL's
+	 * fulltext search without being mucked up by funny charset
+	 * settings or anything else of the sort.
+	 */
+	protected function stripForSearchCallback( $matches ) {
+		return 'U8' . bin2hex( $matches[1] );
+	}
+	
+	/**
+	 * Check MySQL server's ft_min_word_len setting so we know
+	 * if we need to pad short words...
+	 */
+	protected function minSearchLength() {
+		if( !isset( $this->minSearchLength ) ) {
+			$sql = "show global variables like 'ft\\_min\\_word\\_len'";
+			$dbr = wfGetDB( DB_SLAVE );
+			$result = $dbr->query( $sql );
+			$row = $result->fetchObject();
+			$result->free();
+			
+			if( $row && $row->Variable_name == 'ft_min_word_len' ) {
+				$this->minSearchLength = intval( $row->Value );
+			} else {
+				$this->minSearchLength = 0;
+			}
+		}
+		return $this->minSearchLength;
+	}
 
 	function convertForSearchResult( $termsArray ) {
 		# some languages, e.g. Chinese, need to do a conversion
Index: trunk/phase3/languages/classes/LanguageZh.php
===================================================================
--- trunk/phase3/languages/classes/LanguageZh.php	(revision 43919)
+++ trunk/phase3/languages/classes/LanguageZh.php	(revision 43920)
@@ -126,14 +126,14 @@
 
 	// word segmentation
 	function stripForSearch( $string ) {
-		$fname="LanguageZh::stripForSearch";
-		wfProfileIn( $fname );
+		wfProfileIn( __METHOD__ );
 
 		// eventually this should be a word segmentation
 		// for now just treat each character as a word
+		// @fixme only do this for Han characters...
 		$t = preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"' ' .\"$1\"", $string);
+				"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+				" $1", $string);
 
         //always convert to zh-hans before indexing. it should be
 		//better to use zh-hans for search, since conversion from
@@ -142,7 +142,7 @@
 
 		$t = $this->mConverter->autoConvert($t, 'zh-hans');
 		$t = parent::stripForSearch( $t );
-		wfProfileOut( $fname );
+		wfProfileOut( __METHOD__ );
 		return $t;
 
 	}
Index: trunk/phase3/languages/classes/LanguageYue.php
===================================================================
--- trunk/phase3/languages/classes/LanguageYue.php	(revision 43919)
+++ trunk/phase3/languages/classes/LanguageYue.php	(revision 43920)
@@ -4,20 +4,18 @@
  */
 class LanguageYue extends Language {
 	function stripForSearch( $string ) {
-		# MySQL fulltext index doesn't grok utf-8, so we
-		# need to fold cases and convert to hex
-		# we also separate characters as "words"
-		if( function_exists( 'mb_strtolower' ) ) {
-			return preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"' U8' . bin2hex( \"$1\" )",
-				mb_strtolower( $string ) );
-		} else {
-			list( , $wikiLowerChars ) = Language::getCaseMaps();
-			return preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-				$string );
-		}
+		wfProfileIn( __METHOD__ );
+
+		// eventually this should be a word segmentation
+		// for now just treat each character as a word
+		// @fixme only do this for Han characters...
+		$t = preg_replace(
+				"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+				" $1", $string);
+
+		// Do general case folding and UTF-8 armoring
+		$t = parent::stripForSearch( $t );
+		wfProfileOut( __METHOD__ );
+		return $t;
 	}
 }
Index: trunk/phase3/RELEASE-NOTES
===================================================================
--- trunk/phase3/RELEASE-NOTES	(revision 43919)
+++ trunk/phase3/RELEASE-NOTES	(revision 43920)
@@ -363,6 +363,8 @@
 * Improved scripting safety heuristics on SVG uploads.
 * (bug 11728) Unify layout of enhanced watchlist/recent changes
 * (bug 8702) Properly update stats when running nukePage maintenance script
+* (bug 5477) Searches for words less than 4 characters now work without
+  requiring customization of MySQL server settings
 
 === API changes in 1.14 ===
 

Comments

#Comment by Brion VIBBER (Talk | contribs)   02:44, 25 November 2008

make that bug 7726 :P

#Comment by Simetrical (Talk | contribs)   14:15, 25 November 2008

Why not make the mangling for short word length be reversible, so there are no conflicts? E.g., if the word already ends in "U800", append another U800 to the indexed form. (This might increase the size of some large things to be over ft_*max*_word_len, though . . . no clever workarounds for that, huh? Maybe break it up into multiple words and search for all of them . . .)

Status & tagging log

Views
Toolbox