. */

/**
 * Doctrine_Search_Analyzer_Utf8
 *
 * This class is used to analyze (i.e. tokenize) an input $text in
 * $encoding encoding, and return an array of words to be indexed.
 *
 * @package     Doctrine
 * @subpackage  Search
 * @author      Brice Figureau
 * @license     http://www.opensource.org/licenses/lgpl-license.php LGPL
 * @version     $Revision$
 * @link        www.phpdoctrine.org
 * @since       1.0
 */
class Doctrine_Search_Analyzer_Utf8 extends Doctrine_Search_Analyzer_Standard
{
    /**
     * Splits $text into lowercased UTF-8 terms, skipping stopwords.
     *
     * The text is converted to UTF-8 when $encoding names another charset,
     * every run of characters that are neither Unicode letters nor Unicode
     * numbers is treated as a single separator, and each remaining token is
     * lowercased with mb_strtolower(). Tokens found in self::$_stopwords
     * (declared outside this class body) are dropped.
     *
     * @param string      $text     the text to tokenize
     * @param string|null $encoding charset of $text; when null, falls back to
     *                              $this->_options['encoding'] if set, otherwise 'utf-8'
     * @return array                lowercased terms keyed by their position in the
     *                              separator-split token list (keys may be sparse
     *                              because empty tokens and stopwords are skipped);
     *                              empty when conversion or tokenization fails
     */
    public function analyze($text, $encoding = null)
    {
        if ($encoding === null) {
            $encoding = isset($this->_options['encoding'])
                ? $this->_options['encoding']
                : 'utf-8';
        }

        // Only convert when the declared encoding is not already UTF-8.
        if (strcasecmp($encoding, 'utf-8') !== 0 && strcasecmp($encoding, 'utf8') !== 0) {
            $text = iconv($encoding, 'UTF-8', $text);

            // iconv() returns false when the conversion fails; nothing can be indexed then.
            if ($text === false) {
                return array();
            }
        }

        // The "+" quantifier collapses each run of separators into one space,
        // so no further whitespace normalization is needed.
        $text = preg_replace('/[^\p{L}\p{N}]+/u', ' ', $text);

        // preg_replace() returns null on error (e.g. malformed UTF-8 with the /u modifier).
        if ($text === null) {
            return array();
        }

        $ret = array();

        foreach (explode(' ', $text) as $i => $term) {
            // Compare against '' explicitly: empty('0') is true and would
            // wrongly discard the numeric token "0".
            if ($term === '') {
                continue;
            }

            $lower = mb_strtolower($term, 'UTF-8');

            if (in_array($lower, self::$_stopwords)) {
                continue;
            }

            // Keep the original token position as the key.
            $ret[$i] = $lower;
        }

        return $ret;
    }
}