Index: trunk/phase3/includes/SearchMySQL.php
===================================================================
--- trunk/phase3/includes/SearchMySQL.php (revision 52337)
+++ trunk/phase3/includes/SearchMySQL.php (revision 52338)
@@ -48,45 +48,94 @@
$m = array();
if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
$filteredText, $m, PREG_SET_ORDER ) ) {
- foreach( $m as $terms ) {
+ foreach( $m as $bits ) {
+ @list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = $bits;
+
+ if( $nonQuoted != '' ) {
+ $term = $nonQuoted;
+ $quote = '';
+ } else {
+ $term = str_replace( '"', '', $term );
+ $quote = '"';
+ }
+
if( $searchon !== '' ) $searchon .= ' ';
- if( $this->strictMatching && ($terms[1] == '') ) {
- $terms[1] = '+';
+ if( $this->strictMatching && ($modifier == '') ) {
+ // If we leave this out, boolean op defaults to OR which is rarely helpful.
+ $modifier = '+';
}
- // Search terms in all variant forms, only
- // apply on wiki with LanguageConverter
- $temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] );
- if( is_array( $temp_terms )) {
- $temp_terms = array_unique( array_values( $temp_terms ));
- foreach( $temp_terms as $t )
- $searchon .= $terms[1] . $wgContLang->stripForSearch( $t ) . ' ';
+
+ // Some languages such as Serbian store the input form in the search index,
+ // so we may need to search for matches in multiple writing system variants.
+ $convertedVariants = $wgContLang->autoConvertToAllVariants( $term );
+ if( is_array( $convertedVariants ) ) {
+ $variants = array_unique( array_values( $convertedVariants ) );
+ } else {
+ $variants = array( $term );
}
- else
- $searchon .= $terms[1] . $wgContLang->stripForSearch( $terms[2] );
- if( !empty( $terms[3] ) ) {
- // Match individual terms in result highlighting...
- $regexp = preg_quote( $terms[3], '/' );
- if( $terms[4] ) {
- $regexp = "\b$regexp"; // foo*
- } else {
- $regexp = "\b$regexp\b";
+
+ // The low-level search index does some processing on input to work
+ // around problems with minimum lengths and encoding in MySQL's
+ // fulltext engine.
+ // For Chinese this also inserts spaces between adjacent Han characters.
+ $strippedVariants = array_map(
+ array( $wgContLang, 'stripForSearch' ),
+ $variants );
+
+ // Some languages such as Chinese force all variants to a canonical
+ // form when stripping to the low-level search index, so to be sure
+ // let's check our variants list for unique items after stripping.
+ $strippedVariants = array_unique( $strippedVariants );
+
+ $searchon .= $modifier;
+ if( count( $strippedVariants) > 1 )
+ $searchon .= '(';
+ foreach( $strippedVariants as $stripped ) {
+ if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
+ // Hack for Chinese: we need to toss in quotes for
+ // multiple-character phrases since stripForSearch()
+ // added spaces between them to make word breaks.
+ $stripped = '"' . trim( $stripped ) . '"';
}
- } else {
- // Match the quoted term in result highlighting...
- $regexp = preg_quote( str_replace( '"', '', $terms[2] ), '/' );
+ $searchon .= "$quote$stripped$quote$wildcard ";
}
+ if( count( $strippedVariants) > 1 )
+ $searchon .= ')';
+
+ // Match individual terms or quoted phrase in result highlighting...
+ // Note that variants will be introduced in a later stage for highlighting!
+ $regexp = $this->regexTerm( $term, $wildcard );
$this->searchTerms[] = $regexp;
}
- wfDebug( "Would search with '$searchon'\n" );
- wfDebug( 'Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
+ wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
+ wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
} else {
- wfDebug( "Can't understand search query '{$filteredText}'\n" );
+ wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
}
$searchon = $this->db->strencode( $searchon );
$field = $this->getIndexField( $fulltext );
return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) ";
}
+
+ /**
+ * Build the highlighting regex fragment for a single search term.
+ * @param $string String: term or phrase text, matched literally
+ * @param $wildcard String: non-empty (e.g. '*') for a prefix match
+ * @return String: pattern fragment escaped for a '/'-delimited regex
+ */
+ function regexTerm( $string, $wildcard ) {
+ global $wgContLang;
+
+ $escaped = preg_quote( $string, '/' );
+ if( !$wgContLang->hasWordBreaks() ) {
+ // For Chinese, words may legitimately abut other words in the text, so
+ // skip the \b checks; note this could cause false positives for Latin chars.
+ return $escaped;
+ }
+ // A wildcard term is anchored at its start only, so the tail isn't cut off.
+ return $wildcard ? "\b$escaped" : "\b$escaped\b";
+ }
public static function legalSearchChars() {
return "\"*" . parent::legalSearchChars();
Index: trunk/phase3/languages/Language.php
===================================================================
--- trunk/phase3/languages/Language.php (revision 52337)
+++ trunk/phase3/languages/Language.php (revision 52338)
@@ -1595,6 +1595,16 @@
}
/**
+ * Whether this language conventionally separates words with whitespace.
+ * Languages that don't, such as Chinese, override this to return false
+ * and get special handling when text is broken up for searching etc.
+ * @return bool true in this base implementation
+ */
+ function hasWordBreaks() {
+ return true;
+ }
+
+ /**
* Some languages have special punctuation to strip out
* or characters which need to be converted for MySQL's
* indexing to grok it correctly. Make such changes here.
Index: trunk/phase3/languages/classes/LanguageZh.php
===================================================================
--- trunk/phase3/languages/classes/LanguageZh.php (revision 52337)
+++ trunk/phase3/languages/classes/LanguageZh.php (revision 52338)
@@ -175,19 +175,12 @@
function stripForSearch( $string ) {
wfProfileIn( __METHOD__ );
- // eventually this should be a word segmentation
- // for now just treat each character as a word
- // @fixme only do this for Han characters...
- $t = preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
- " $1", $string);
-
//always convert to zh-hans before indexing. it should be
//better to use zh-hans for search, since conversion from
//Traditional to Simplified is less ambiguous than the
//other way around
- $t = $this->mConverter->autoConvert($t, 'zh-hans');
+ $t = $this->mConverter->autoConvert( $string, 'zh-hans' );
$t = parent::stripForSearch( $t );
wfProfileOut( __METHOD__ );
return $t;
Index: trunk/phase3/languages/classes/LanguageZh_hans.php
===================================================================
--- trunk/phase3/languages/classes/LanguageZh_hans.php (revision 52337)
+++ trunk/phase3/languages/classes/LanguageZh_hans.php (revision 52338)
@@ -4,21 +4,23 @@
* @ingroup Language
*/
class LanguageZh_hans extends Language {
+ function hasWordBreaks() {
+ return false;
+ }
+
function stripForSearch( $string ) {
- # MySQL fulltext index doesn't grok utf-8, so we
- # need to fold cases and convert to hex
- # we also separate characters as "words"
- if( function_exists( 'mb_strtolower' ) ) {
- return preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "' U8' . bin2hex( \"$1\" )",
- mb_strtolower( $string ) );
- } else {
- list( , $wikiLowerChars ) = Language::getCaseMaps();
- return preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
- $string );
- }
+ // Real word segmentation would be the proper approach; until then
+ // each multi-byte character is treated as a word of its own.
+ //
+ // Padding goes on both sides of every such character so that a
+ // number or Latin char directly after a Han char is split off too.
+ //
+ // @fixme only do this for Han characters...
+ $spaced = preg_replace(
+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+ " $1 ", $string);
+ // Squeeze the runs of spaces left by the padding, then drop the outer ones.
+ $spaced = trim( preg_replace( '/ +/', ' ', $spaced ) );
+ return parent::stripForSearch( $spaced );
}
}
Index: trunk/phase3/RELEASE-NOTES
===================================================================
--- trunk/phase3/RELEASE-NOTES (revision 52337)
+++ trunk/phase3/RELEASE-NOTES (revision 52338)
@@ -206,6 +206,7 @@
via extensions not using the userCan hook and via $wgRevokePermissions now work.
* (bug 19157) createAndPromote error on bad password
* (bug 18768) Remove AdminSettings.php from MediaWiki core
+* (bug 8445) Multiple-character search terms are now handled properly for Chinese
== API changes in 1.16 ==