Index: Search/Lucene/TempIndex.php
===================================================================
--- Search/Lucene/TempIndex.php (Revision 0)
+++ Search/Lucene/TempIndex.php (Revision 0)
@@ -0,0 +1,908 @@
+_documents);
+    }
+
+    /**
+     * Returns one greater than the largest possible document number.
+     * This may be used to, e.g., determine how big to allocate a structure which will have
+     * an element for every document number in an index.
+     *
+     * @return integer
+     */
+    public function maxDoc()
+    {
+        return $this->_docID;
+    }
+
+    /**
+     * Returns the total number of non-deleted documents in this index.
+     *
+     * @return integer
+     */
+    public function numDocs()
+    {
+        return sizeof($this->_documents);
+    }
+
+    /**
+     * Checks, that document is deleted
+     *
+     * Every id that is not currently stored is reported as deleted, including
+     * ids that were never assigned. This implementation does not throw.
+     *
+     * @param integer $id
+     * @return boolean
+     */
+    public function isDeleted($id)
+    {
+        return !isset($this->_documents[$id]);
+    }
+
+    /**
+     * Set default search field.
+     *
+     * Null means, that search is performed through all fields by default
+     *
+     * Default value is null
+     *
+     * The method is static, so there is no $this to write to; the value is kept
+     * in a static property, like the result set limit.
+     * NOTE(review): assumes $_defaultSearchField is declared static in the class
+     * header, which is not visible in this part of the patch - confirm.
+     *
+     * @param string $fieldName
+     */
+    public static function setDefaultSearchField($fieldName)
+    {
+        self::$_defaultSearchField = $fieldName;
+    }
+
+    /**
+     * Get default search field.
+     *
+     * Null means, that search is performed through all fields by default
+     *
+     * @return string
+     */
+    public static function getDefaultSearchField()
+    {
+        return self::$_defaultSearchField;
+    }
+
+    /**
+     * Set result set limit.
+     *
+     * 0 (default) means no limit
+     *
+     * @param integer $limit
+     */
+    public static function setResultSetLimit($limit)
+    {
+        self::$_resultSetLimit = $limit;
+    }
+
+    /**
+     * Get result set limit. 
+ * + * 0 means no limit + * + * @return integer + */ + public static function getResultSetLimit() + { + return self::$_resultSetLimit; + } + + /** + * Retrieve index maxBufferedDocs option + * + * maxBufferedDocs is a minimal number of documents required before + * the buffered in-memory documents are written into a new Segment + * + * Default value is 10 + * + * @return integer + */ + public function getMaxBufferedDocs() + { + return 0; + } + + /** + * Set index maxBufferedDocs option + * + * maxBufferedDocs is a minimal number of documents required before + * the buffered in-memory documents are written into a new Segment + * + * Default value is 10 + * + * @param integer $maxBufferedDocs + */ + public function setMaxBufferedDocs($maxBufferedDocs) + { + // has no function on temp index + } + + /** + * Retrieve index maxMergeDocs option + * + * maxMergeDocs is a largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. + * + * Default value is PHP_INT_MAX + * + * @return integer + */ + public function getMaxMergeDocs() + { + return 0; + } + + /** + * Set index maxMergeDocs option + * + * maxMergeDocs is a largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. + * + * Default value is PHP_INT_MAX + * + * @param integer $maxMergeDocs + */ + public function setMaxMergeDocs($maxMergeDocs) + { + // has no function on temp index + } + + /** + * Retrieve index mergeFactor option + * + * mergeFactor determines how often segment indices are merged by addDocument(). 
+ * With smaller values, less RAM is used while indexing, + * and searches on unoptimized indices are faster, + * but indexing speed is slower. + * With larger values, more RAM is used during indexing, + * and while searches on unoptimized indices are slower, + * indexing is faster. + * Thus larger values (> 10) are best for batch index creation, + * and smaller values (< 10) for indices that are interactively maintained. + * + * Default value is 10 + * + * @return integer + */ + public function getMergeFactor() + { + return 0; + } + + /** + * Set index mergeFactor option + * + * mergeFactor determines how often segment indices are merged by addDocument(). + * With smaller values, less RAM is used while indexing, + * and searches on unoptimized indices are faster, + * but indexing speed is slower. + * With larger values, more RAM is used during indexing, + * and while searches on unoptimized indices are slower, + * indexing is faster. + * Thus larger values (> 10) are best for batch index creation, + * and smaller values (< 10) for indices that are interactively maintained. + * + * Default value is 10 + * + * @param integer $maxMergeDocs + */ + public function setMergeFactor($mergeFactor) + { + // has no function on temp index + } + + /** + * Performs a query against the index and returns an array + * of Zend_Search_Lucene_Search_QueryHit objects. + * Input is a string or Zend_Search_Lucene_Search_Query. 
+ * + * 100% identical to Zend_Search_Lucene->find() + * + * @param mixed $query + * @return array Zend_Search_Lucene_Search_QueryHit + * @throws Zend_Search_Lucene_Exception + */ + public function find($query) + { + if (is_string($query)) { + $query = Zend_Search_Lucene_Search_QueryParser::parse($query); + } + + if (!$query instanceof Zend_Search_Lucene_Search_Query) { + require_once 'Zend/Search/Lucene/Exception.php'; + throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object'); + } + + $this->commit(); + + $hits = array(); + $scores = array(); + $ids = array(); + + $query = $query->rewrite($this)->optimize($this); + + $query->execute($this); + + $topScore = 0; + foreach ($query->matchedDocs() as $id => $num) { + $docScore = $query->score($id, $this); + if( $docScore != 0 ) { + $hit = new Zend_Search_Lucene_Search_QueryHit($this); + $hit->id = $id; + $hit->score = $docScore; + + $hits[] = $hit; + $ids[] = $id; + $scores[] = $docScore; + + if ($docScore > $topScore) { + $topScore = $docScore; + } + } + + if (self::$_resultSetLimit != 0 && count($hits) >= self::$_resultSetLimit) { + break; + } + } + + if (count($hits) == 0) { + // skip sorting, which may cause a error on empty index + return array(); + } + + if ($topScore > 1) { + foreach ($hits as $hit) { + $hit->score /= $topScore; + } + } + + if (func_num_args() == 1) { + // sort by scores + array_multisort($scores, SORT_DESC, SORT_NUMERIC, + $ids, SORT_ASC, SORT_NUMERIC, + $hits); + } else { + // sort by given field names + + $argList = func_get_args(); + $fieldNames = $this->getFieldNames(); + $sortArgs = array(); + + // PHP 5.3 now expects all arguments to array_multisort be passed by + // reference (if it's invoked through call_user_func_array()); + // since constants can't be passed by reference, create some placeholder variables. 
+ $sortReg = SORT_REGULAR; + $sortAsc = SORT_ASC; + $sortNum = SORT_NUMERIC; + + require_once 'Zend/Search/Lucene/Exception.php'; + for ($count = 1; $count < count($argList); $count++) { + $fieldName = $argList[$count]; + + if (!is_string($fieldName)) { + throw new Zend_Search_Lucene_Exception('Field name must be a string.'); + } + + if (!in_array($fieldName, $fieldNames)) { + throw new Zend_Search_Lucene_Exception('Wrong field name.'); + } + + $valuesArray = array(); + foreach ($hits as $hit) { + try { + $value = $hit->getDocument()->getFieldValue($fieldName); + } catch (Zend_Search_Lucene_Exception $e) { + if (strpos($e->getMessage(), 'not found') === false) { + throw $e; + } else { + $value = null; + } + } + + $valuesArray[] = $value; + } + + $sortArgs[] = &$valuesArray; + + if ($count + 1 < count($argList) && is_integer($argList[$count+1])) { + $count++; + $sortArgs[] = &$argList[$count]; + + if ($count + 1 < count($argList) && is_integer($argList[$count+1])) { + $count++; + $sortArgs[] = &$argList[$count]; + } else { + if ($argList[$count] == SORT_ASC || $argList[$count] == SORT_DESC) { + $sortArgs[] = &$sortReg; + } else { + $sortArgs[] = &$sortAsc; + } + } + } else { + $sortArgs[] = &$sortAsc; + $sortArgs[] = &$sortReg; + } + } + + // Sort by id's if values are equal + $sortArgs[] = &$ids; + $sortArgs[] = &$sortAsc; + $sortArgs[] = &$sortNum; + + // Array to be sorted + $sortArgs[] = &$hits; + + // Do sort + call_user_func_array('array_multisort', $sortArgs); + } + + return $hits; + } + + /** + * Returns a list of all unique field names that exist in this index. + * + * @param boolean $indexed + * @return array + */ + public function getFieldNames($indexed = false) + { + return array_keys($this->_fields); + } + + /** + * Returns a Zend_Search_Lucene_Document object for the document + * number $id in this index. 
+     *
+     * A query hit may be passed instead of a document number; its id is used.
+     *
+     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+     * @return Zend_Search_Lucene_Document
+     * @throws Zend_Search_Lucene_Exception if no document is stored under $id
+     */
+    public function getDocument($id)
+    {
+        // an object cannot be used as an array offset, so unwrap the hit first
+        if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
+            $id = $id->id;
+        }
+
+        if (!isset($this->_documents[$id])) {
+            require_once 'Zend/Search/Lucene/Exception.php';
+            throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
+        }
+
+        return $this->_documents[$id];
+    }
+
+    /**
+     * Returns true if index contain documents with specified term.
+     *
+     * Is used for query optimization.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return boolean
+     */
+    public function hasTerm(Zend_Search_Lucene_Index_Term $term)
+    {
+        return isset($this->_terms[$term->field][$term->text]);
+    }
+
+    /**
+     * Returns IDs of all the documents containing term.
+     *
+     * When a filter is given, only ids that appear as keys in one of its
+     * segment filters are returned.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
+     * @return array
+     */
+    public function termDocs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
+    {
+        if ($docsFilter != null) {
+            $allowed = array();
+            foreach($docsFilter->segmentFilters as $seg) {
+                // ids are compared by value here, so renumbering by array_merge is harmless
+                $allowed = array_merge(array_keys($seg),$allowed);
+            }
+            $result = $this->termDocs($term);
+            return array_intersect($result,$allowed);
+        } else {
+            if (isset($this->_termDocs[$term->field][$term->text])) {
+                return array_keys($this->_termDocs[$term->field][$term->text]);
+            } else {
+                return array();
+            }
+        }
+    }
+
+    /**
+     * Returns documents filter for all documents containing term.
+     *
+     * It performs the same operation as termDocs, but return result as
+     * Zend_Search_Lucene_Index_DocsFilter object
+     *
+     * Not implemented for the temp index: this method always throws.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
+     * @return Zend_Search_Lucene_Index_DocsFilter
+     * @throws Zend_Search_Lucene_Exception always
+     */
+    public function termDocsFilter(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
+    {
+        require_once 'Zend/Search/Lucene/Exception.php';
+        throw new Zend_Search_Lucene_Exception('DocsFilter is unimplemented in TempIndex. This function is assumed to be buggy in Lucene.php (it returns an array instead of a DocsFilter object), and not used anywhere in ZEND code. 
Specification is unclear.');
+    }
+
+    /**
+     * Returns an array of all term freqs.
+     * Return array structure: array( docId => freq, ...)
+     *
+     * When a filter is given, only entries whose document id appears as a key
+     * in one of its segment filters are returned.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
+     * @return array
+     */
+    public function termFreqs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
+    {
+        if ($docsFilter != null) {
+            $allowed = array();
+            foreach($docsFilter->segmentFilters as $seg) {
+                // union keeps the document ids as keys; array_merge() would
+                // renumber them and the key intersection below would filter
+                // on the wrong ids
+                $allowed = $allowed + $seg;
+            }
+            $result = $this->termFreqs($term);
+            return array_intersect_key($result,$allowed);
+        } else {
+            if (!isset($this->_termDocs[$term->field][$term->text])) {
+                return array();
+            } else {
+                return $this->_termDocs[$term->field][$term->text];
+            }
+        }
+    }
+
+    /**
+     * Returns an array of all term positions in the documents.
+     * Return array structure: array( docId => array( pos1, pos2, ...), ...)
+     *
+     * When a filter is given, only entries whose document id appears as a key
+     * in one of its segment filters are returned.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
+     * @return array
+     */
+    public function termPositions(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
+    {
+        if ($docsFilter != null) {
+            $allowed = array();
+            foreach($docsFilter->segmentFilters as $seg) {
+                // union keeps the document ids as keys (array_merge() renumbers them)
+                $allowed = $allowed + $seg;
+            }
+            $result = $this->termPositions($term);
+            return array_intersect_key($result,$allowed);
+        } else {
+            if (!isset($this->_termPositions[$term->field][$term->text])) {
+                return array();
+            } else {
+                return $this->_termPositions[$term->field][$term->text];
+            }
+        }
+
+    }
+
+    /**
+     * Returns the number of documents in this index containing the $term. 
+     *
+     * Returns 0 for a term that is not in the index.
+     *
+     * @param Zend_Search_Lucene_Index_Term $term
+     * @return integer
+     */
+    public function docFreq(Zend_Search_Lucene_Index_Term $term)
+    {
+        // unknown terms have no postings entry; do not read an undefined index
+        if (!isset($this->_termDocs[$term->field][$term->text])) {
+            return 0;
+        }
+
+        return sizeof($this->_termDocs[$term->field][$term->text]);
+    }
+
+    /**
+     * Retrieve similarity used by index reader
+     *
+     * @return Zend_Search_Lucene_Search_Similarity
+     */
+    public function getSimilarity()
+    {
+        return Zend_Search_Lucene_Search_Similarity::getDefault();
+    }
+
+    /**
+     * Returns a normalization factor for "field, document" pair.
+     *
+     * @param integer $id
+     * @param string $fieldName
+     * @return float
+     */
+    public function norm($id, $fieldName)
+    {
+        return $this->_norms[$fieldName][$id];
+    }
+
+    /**
+     * Returns true if any documents have been deleted from this index.
+     *
+     * NOTE(review): always reports false, even after delete() has removed a document.
+     *
+     * @return boolean
+     */
+    public function hasDeletions()
+    {
+        return false;
+    }
+
+    /**
+     * Deletes a document from the index.
+     * $id is an internal document id
+     *
+     * A query hit may be passed instead of a document number; its id is used.
+     * Only the stored document is removed.
+     *
+     * @param integer|Zend_Search_Lucene_Search_QueryHit $id
+     * @throws Zend_Search_Lucene_Exception
+     */
+    public function delete($id)
+    {
+        // an object cannot be used as an array offset, so unwrap the hit first
+        if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
+            $id = $id->id;
+        }
+
+        unset($this->_documents[$id]);
+    }
+
+    /**
+     * Adds a document to this index. 
+     *
+     * Every field of the document is tokenized (or taken as a single token when
+     * the field is not tokenized) and recorded in the in-memory term dictionary,
+     * the term positions and the per-document term frequencies.
+     *
+     * @param Zend_Search_Lucene_Document $document
+     */
+    public function addDocument(Zend_Search_Lucene_Document $document)
+    {
+        $this->_documents[$this->_docID] = $document;
+
+        // parse document
+        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
+        $fieldNames = $document->getFieldnames();
+        foreach ($fieldNames as $fieldName) {
+
+            $field = $document->getField($fieldName);
+
+            // tokenize if requested
+            if ($field->isTokenized) {
+                $tokens = $analyzer->tokenize($field->getUtf8Value(), 'UTF-8');
+            } else {
+                $tokens = array( new Zend_Search_Lucene_Analysis_Token($field->getUtf8Value(),0,strlen(utf8_decode($field->getUtf8Value()))) );
+            }
+
+            // store tokens in "index"
+            $position=-1;
+
+            foreach( $tokens as $token ) {
+                $text = $token->getTermText();
+                $term = new Zend_Search_Lucene_Index_Term($text,$fieldName);
+
+                $position += $token->getPositionIncrement();
+
+                // build an ordered array (list) of terms for each field
+                if (isset($this->_terms[$fieldName])) {
+
+                    // if the term is not set already, sort it in
+                    if (!isset($this->_terms[$fieldName][$text])) {
+                        // Rebuild the list keyed by term text. array_shift() and
+                        // array_merge() must not be used here: both renumber
+                        // integer-like keys, so purely numeric terms would lose
+                        // their key and never be found by hasTerm() again.
+                        $new = array();
+                        $inserted = false;
+                        foreach ($this->_terms[$fieldName] as $current) {
+                            // insert in front of the first term that is not smaller
+                            if (!$inserted && !($text > $current->text)) {
+                                $new[$text] = $term;
+                                $inserted = true;
+                            }
+                            $new[$current->text] = $current;
+                        }
+                        if (!$inserted) {
+                            // larger than every known term: append
+                            $new[$text] = $term;
+                        }
+                        $this->_terms[$fieldName] = $new;
+                    }
+                } else {
+                    // first terms in each field are just stored
+                    $this->_terms[$fieldName][$text] = $term;
+                }
+
+                // store termPosition for this term
+                $this->_termPositions[$fieldName][$text][$this->_docID][] = $position;
+
+                // store or increase term freq for this document
+                if (!isset($this->_termDocs[$fieldName][$text][$this->_docID])) {
+                    $this->_termDocs[$fieldName][$text][$this->_docID] = 1;
+                } else {
+                    $this->_termDocs[$fieldName][$text][$this->_docID]++;
+                }
+
+            }
+
+            // remember fieldname and document
+            $this->_fields[$fieldName][$this->_docID] = 1;
+
+            // calculate and store normalisation 
vector + $this->_norms[$fieldName][$this->_docID] = $this->getSimilarity()->lengthNorm($fieldName,sizeof($tokens))*$document->boost*$field->boost; + } + + // increase docID + $this->_docID++; + } + + /** + * Commit changes resulting from delete() or undeleteAll() operations. + */ + public function commit() + { + // cannot commit anything in memory + } + + /** + * Optimize index. + * + * Merges all segments into one + */ + public function optimize() + { + // cannot optimize anything in memory + } + + /** + * Returns an array of all terms in this index. + * + * @return array + */ + public function terms() + { + $result=array(); + foreach ($this->_terms as $fieldarray) { + $result=array_merge($result,array_values($fieldarray)); + } + return $result; + } + + /** + * Undeletes all documents currently marked as deleted in this index. + */ + public function undeleteAll() + { + // objects in memory can not be undeleted + } + + + /** + * Add reference to the index object + * + * @internal + */ + public function addReference() + { + // has no function on temp index + } + + /** + * Remove reference from the index object + * + * When reference count becomes zero, index is closed and resources are cleaned up + * + * @internal + */ + public function removeReference() + { + // has no function on temp index + } + + /** + * Reset terms stream. + */ + public function resetTermsStream(){ + reset($this->_terms); + } + + /** + * Skip terms stream up to specified term prefix. 
+     *
+     * Prefix contains fully specified field info and portion of searched term
+     *
+     * The prefix field becomes the current field of the stream. If that field
+     * has no terms, currentTerm() and nextTerm() return null afterwards.
+     *
+     * @param Zend_Search_Lucene_Index_Term $prefix
+     */
+    public function skipTo(Zend_Search_Lucene_Index_Term $prefix){
+        $field = $prefix->field;
+        $text = $prefix->text;
+
+        // select the field first, so that an unknown field gives an empty
+        // stream instead of leaving the previously selected field active
+        $this->_currentField = $field;
+        if (empty($this->_terms[$field])) return;
+
+        reset($this->_terms[$field]);
+        // current() returns false once the end of the list is passed
+        while (($current = current($this->_terms[$field])) !== false && $current->text < $text) {
+            next($this->_terms[$field]);
+        }
+    }
+
+    /**
+     * Scans terms dictionary and returns next term
+     *
+     * @return Zend_Search_Lucene_Index_Term|null
+     */
+    public function nextTerm(){
+        if ( !isset($this->_terms[$this->_currentField]) ) return null;
+        $term = next($this->_terms[$this->_currentField]);
+        // next() signals the end of the list with false; the documented value is null
+        return ($term === false) ? null : $term;
+    }
+
+    /**
+     * Returns term in current position
+     *
+     * @return Zend_Search_Lucene_Index_Term|null
+     */
+    public function currentTerm(){
+        if ( !isset($this->_terms[$this->_currentField]) ) return null;
+        $term = current($this->_terms[$this->_currentField]);
+        // current() signals the end of the list with false; the documented value is null
+        return ($term === false) ? null : $term;
+    }
+
+    /**
+     * Close terms stream
+     *
+     * Should be used for resources clean up if stream is not read up to the end
+     */
+    public function closeTermsStream(){
+        // has no function on temp index
+    }
+
+}
+
Index: Search/Lucene/Search/Query.php =================================================================== --- Search/Lucene/Search/Query.php (Revision 17920) +++ Search/Lucene/Search/Query.php (Arbeitskopie) @@ -29,6 +29,8 @@ /** Zend_Search_Lucene_Search_Highlighter_Default */ require_once 'Zend/Search/Lucene/Search/Highlighter/Default.php'; +/** Zend_Search_Lucene_TempIndex */ +require_once 'Zend/Search/Lucene/TempIndex.php'; /** * @category Zend @@ -178,21 +180,15 @@ abstract public function getQueryTerms(); /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - abstract protected 
function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter); - - /** * Highlight matches in $inputHTML * * @param string $inputHTML * @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag. * @param Zend_Search_Lucene_Search_Highlighter_Interface|null $highlighter + * @param Zend_Search_Lucene_Interface|null $index * @return string */ - public function highlightMatches($inputHTML, $defaultEncoding = '', $highlighter = null) + public function highlightMatches($inputHTML, $defaultEncoding = '', $highlighter = null, $index = null) { if ($highlighter === null) { $highlighter = new Zend_Search_Lucene_Search_Highlighter_Default(); @@ -201,8 +197,16 @@ $doc = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML, false, $defaultEncoding); $highlighter->setDocument($doc); - $this->_highlightMatches($highlighter); + if ($index == null) { + $index = new Zend_Search_Lucene_TempIndex(); + $index->addDocument($doc); + } + $terms = $this->rewrite($index)->getQueryTerms(); + foreach ($terms as $term) { + $highlighter->highlight($term->text); + } + return $doc->getHTML(); } @@ -212,9 +216,10 @@ * @param string $inputHtmlFragment * @param string $encoding Input HTML string encoding * @param Zend_Search_Lucene_Search_Highlighter_Interface|null $highlighter + * @param Zend_Search_Lucene_Interface|null $index * @return string */ - public function htmlFragmentHighlightMatches($inputHtmlFragment, $encoding = 'UTF-8', $highlighter = null) + public function htmlFragmentHighlightMatches($inputHtmlFragment, $encoding = 'UTF-8', $highlighter = null, $index = null) { if ($highlighter === null) { $highlighter = new Zend_Search_Lucene_Search_Highlighter_Default(); @@ -226,9 +231,18 @@ $doc = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML); $highlighter->setDocument($doc); - $this->_highlightMatches($highlighter); + if ($index == null) { + $index = new Zend_Search_Lucene_TempIndex(); + 
$index->addDocument($doc); + } + $terms = $this->rewrite($index)->getQueryTerms(); + foreach ($terms as $term) { + $highlighter->highlight($term->text); + } + return $doc->getHtmlBody(); } + } Index: Search/Lucene/Search/Query/Wildcard.php =================================================================== --- Search/Lucene/Search/Query/Wildcard.php (Revision 17920) +++ Search/Lucene/Search/Query/Wildcard.php (Arbeitskopie) @@ -299,33 +299,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - $words = array(); - - $matchExpression = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*') , preg_quote($this->_pattern->text, '/')) . '$/'; - if (@preg_match('/\pL/u', 'a') == 1) { - // PCRE unicode support is turned on - // add Unicode modifier to the match expression - $matchExpression .= 'u'; - } - - $docBody = $highlighter->getDocument()->getFieldUtf8Value('body'); - $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8'); - foreach ($tokens as $token) { - if (preg_match($matchExpression, $token->getTermText()) === 1) { - $words[] = $token->getTermText(); - } - } - - $highlighter->highlight($words); - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Range.php =================================================================== --- Search/Lucene/Search/Query/Range.php (Revision 17920) +++ Search/Lucene/Search/Query/Range.php (Arbeitskopie) @@ -319,42 +319,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - $words = array(); - - 
$docBody = $highlighter->getDocument()->getFieldUtf8Value('body'); - $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8'); - - $lowerTermText = ($this->_lowerTerm !== null)? $this->_lowerTerm->text : null; - $upperTermText = ($this->_upperTerm !== null)? $this->_upperTerm->text : null; - - if ($this->_inclusive) { - foreach ($tokens as $token) { - $termText = $token->getTermText(); - if (($lowerTermText == null || $lowerTermText <= $termText) && - ($upperTermText == null || $termText <= $upperTermText)) { - $words[] = $termText; - } - } - } else { - foreach ($tokens as $token) { - $termText = $token->getTermText(); - if (($lowerTermText == null || $lowerTermText < $termText) && - ($upperTermText == null || $termText < $upperTermText)) { - $words[] = $termText; - } - } - } - - $highlighter->highlight($words); - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Preprocessing/Phrase.php =================================================================== --- Search/Lucene/Search/Query/Preprocessing/Phrase.php (Revision 17920) +++ Search/Lucene/Search/Query/Preprocessing/Phrase.php (Arbeitskopie) @@ -212,40 +212,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */ - - /** Skip exact term matching recognition, keyword fields highlighting is not supported */ - - /** Skip wildcard queries recognition. 
Supported wildcards are removed by text analyzer */ - - // tokenize phrase using current analyzer and process it as a phrase query - $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding); - - if (count($tokens) == 0) { - // Do nothing - return; - } - - if (count($tokens) == 1) { - $highlighter->highlight($tokens[0]->getTermText()); - return; - } - - //It's non-trivial phrase query - $words = array(); - foreach ($tokens as $token) { - $words[] = $token->getTermText(); - } - $highlighter->highlight($words); - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Preprocessing/Fuzzy.php =================================================================== --- Search/Lucene/Search/Query/Preprocessing/Fuzzy.php (Revision 17920) +++ Search/Lucene/Search/Query/Preprocessing/Fuzzy.php (Arbeitskopie) @@ -218,51 +218,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - /** Skip fields detection. 
We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */ - - /** Skip exact term matching recognition, keyword fields highlighting is not supported */ - - // ------------------------------------- - // Recognize wildcard queries - - /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */ - if (@preg_match('/\pL/u', 'a') == 1) { - $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word)); - } else { - $subPatterns = preg_split('/[*?]/', $this->_word); - } - if (count($subPatterns) > 1) { - // Do nothing - return; - } - - // ------------------------------------- - // Recognize one-term multi-term and "insignificant" queries - $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding); - if (count($tokens) == 0) { - // Do nothing - return; - } - if (count($tokens) == 1) { - $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field); - $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity); - - $query->_highlightMatches($highlighter); - return; - } - - // Word is tokenized into several tokens - // But fuzzy search is supported only for non-multiple word terms - // Do nothing - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Preprocessing/Term.php =================================================================== --- Search/Lucene/Search/Query/Preprocessing/Term.php (Revision 17920) +++ Search/Lucene/Search/Query/Preprocessing/Term.php (Arbeitskopie) @@ -236,81 +236,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - /** Skip fields detection. 
We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */ - - /** Skip exact term matching recognition, keyword fields highlighting is not supported */ - - // ------------------------------------- - // Recognize wildcard queries - /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */ - if (@preg_match('/\pL/u', 'a') == 1) { - $word = iconv($this->_encoding, 'UTF-8', $this->_word); - $wildcardsPattern = '/[*?]/u'; - $subPatternsEncoding = 'UTF-8'; - } else { - $word = $this->_word; - $wildcardsPattern = '/[*?]/'; - $subPatternsEncoding = $this->_encoding; - } - $subPatterns = preg_split($wildcardsPattern, $word, -1, PREG_SPLIT_OFFSET_CAPTURE); - if (count($subPatterns) > 1) { - // Wildcard query is recognized - - $pattern = ''; - - foreach ($subPatterns as $id => $subPattern) { - // Append corresponding wildcard character to the pattern before each sub-pattern (except first) - if ($id != 0) { - $pattern .= $word[ $subPattern[1] - 1 ]; - } - - // Check if each subputtern is a single word in terms of current analyzer - $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPattern[0], $subPatternsEncoding); - if (count($tokens) > 1) { - // Do nothing (nothing is highlighted) - return; - } - foreach ($tokens as $token) { - $pattern .= $token->getTermText(); - } - } - - $term = new Zend_Search_Lucene_Index_Term($pattern, $this->_field); - $query = new Zend_Search_Lucene_Search_Query_Wildcard($term); - - $query->_highlightMatches($highlighter); - return; - } - - // ------------------------------------- - // Recognize one-term multi-term and "insignificant" queries - $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding); - - if (count($tokens) == 0) { - // Do nothing - return; - } - - if (count($tokens) == 1) { - $highlighter->highlight($tokens[0]->getTermText()); - return; - } - - //It's not insignificant or one 
term query - $words = array(); - foreach ($tokens as $token) { - $words[] = $token->getTermText(); - } - $highlighter->highlight($words); - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Empty.php =================================================================== --- Search/Lucene/Search/Query/Empty.php (Revision 17920) +++ Search/Lucene/Search/Query/Empty.php (Arbeitskopie) @@ -118,16 +118,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - // Do nothing - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Boolean.php =================================================================== --- Search/Lucene/Search/Query/Boolean.php (Revision 17920) +++ Search/Lucene/Search/Query/Boolean.php (Arbeitskopie) @@ -758,20 +758,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - foreach ($this->_subqueries as $id => $subquery) { - if ($this->_signs === null || $this->_signs[$id] !== false) { - $subquery->_highlightMatches($highlighter); - } - } - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/MultiTerm.php =================================================================== --- Search/Lucene/Search/Query/MultiTerm.php (Revision 17920) +++ Search/Lucene/Search/Query/MultiTerm.php (Arbeitskopie) @@ -600,30 +600,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected 
function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - $words = array(); - - if ($this->_signs === null) { - foreach ($this->_terms as $term) { - $words[] = $term->text; - } - } else { - foreach ($this->_signs as $id => $sign) { - if ($sign !== false) { - $words[] = $this->_terms[$id]->text; - } - } - } - - $highlighter->highlight($words); - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Phrase.php =================================================================== --- Search/Lucene/Search/Query/Phrase.php (Revision 17920) +++ Search/Lucene/Search/Query/Phrase.php (Arbeitskopie) @@ -518,21 +518,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - $words = array(); - foreach ($this->_terms as $term) { - $words[] = $term->text; - } - - $highlighter->highlight($words); - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Insignificant.php =================================================================== --- Search/Lucene/Search/Query/Insignificant.php (Revision 17920) +++ Search/Lucene/Search/Query/Insignificant.php (Arbeitskopie) @@ -119,16 +119,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - // Do nothing - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Fuzzy.php =================================================================== --- Search/Lucene/Search/Query/Fuzzy.php (Revision 17920) +++ Search/Lucene/Search/Query/Fuzzy.php (Arbeitskopie) @@ -410,68 
+410,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - $words = array(); - - $prefix = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength); - $prefixByteLength = strlen($prefix); - $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix); - - $termLength = Zend_Search_Lucene_Index_Term::getLength($this->_term->text); - - $termRest = substr($this->_term->text, $prefixByteLength); - // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible - $termRestLength = strlen($termRest); - - $scaleFactor = 1/(1 - $this->_minimumSimilarity); - - - $docBody = $highlighter->getDocument()->getFieldUtf8Value('body'); - $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8'); - foreach ($tokens as $token) { - $termText = $token->getTermText(); - - if (substr($termText, 0, $prefixByteLength) == $prefix) { - // Calculate similarity - $target = substr($termText, $prefixByteLength); - - $maxDistance = isset($this->_maxDistances[strlen($target)])? - $this->_maxDistances[strlen($target)] : - $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, strlen($target)); - - if ($termRestLength == 0) { - // we don't have anything to compare. That means if we just add - // the letters for current term we get the new word - $similarity = (($prefixUtf8Length == 0)? 0 : 1 - strlen($target)/$prefixUtf8Length); - } else if (strlen($target) == 0) { - $similarity = (($prefixUtf8Length == 0)? 
0 : 1 - $termRestLength/$prefixUtf8Length); - } else if ($maxDistance < abs($termRestLength - strlen($target))){ - //just adding the characters of term to target or vice-versa results in too many edits - //for example "pre" length is 3 and "prefixes" length is 8. We can see that - //given this optimal circumstance, the edit distance cannot be less than 5. - //which is 8-3 or more precisesly abs(3-8). - //if our maximum edit distance is 4, then we can discard this word - //without looking at it. - $similarity = 0; - } else { - $similarity = 1 - levenshtein($termRest, $target)/($prefixUtf8Length + min($termRestLength, strlen($target))); - } - - if ($similarity > $this->_minimumSimilarity) { - $words[] = $termText; - } - } - } - - $highlighter->highlight($words); - } - - /** * Print a query * * @return string Index: Search/Lucene/Search/Query/Term.php =================================================================== --- Search/Lucene/Search/Query/Term.php (Revision 17920) +++ Search/Lucene/Search/Query/Term.php (Arbeitskopie) @@ -192,16 +192,6 @@ } /** - * Query specific matches highlighting - * - * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting) - */ - protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter) - { - $highlighter->highlight($this->_term->text); - } - - /** * Print a query * * @return string