Index: library/Zend/Search/Lucene/Document/Html.php =================================================================== --- library/Zend/Search/Lucene/Document/Html.php (revision 20197) +++ library/Zend/Search/Lucene/Document/Html.php (working copy) @@ -68,6 +68,17 @@ private static $_excludeNoFollowLinks = false; /** + * + * List of inline tags + * + * @var array + */ + private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code', + 'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike', + 'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins', + 'q', 'sub', 'sup'); + + /** * Object constructor * * @param string $data HTML string (may be HTML fragment, ) @@ -197,8 +208,10 @@ private function _retrieveNodeText(DOMNode $node, &$text) { if ($node->nodeType == XML_TEXT_NODE) { - $text .= $node->nodeValue ; - $text .= ' '; + $text .= $node->nodeValue; + if(!in_array($node->parentNode->tagName, $this->_inlineTags)) { + $text .= ' '; + } } else if ($node->nodeType == XML_ELEMENT_NODE && $node->nodeName != 'script') { foreach ($node->childNodes as $childNode) { $this->_retrieveNodeText($childNode, $text); Index: tests/Zend/Search/Lucene/DocumentTest.php =================================================================== --- tests/Zend/Search/Lucene/DocumentTest.php (revision 20197) +++ tests/Zend/Search/Lucene/DocumentTest.php (working copy) @@ -55,6 +55,23 @@ */ class Zend_Search_Lucene_DocumentTest extends PHPUnit_Framework_TestCase { + + private function _clearDirectory($dirName) + { + if (!file_exists($dirName) || !is_dir($dirName)) { + return; + } + + // remove files from temporary direcytory + $dir = opendir($dirName); + while (($file = readdir($dir)) !== false) { + if (!is_dir($dirName . '/' . $file)) { + @unlink($dirName . '/' . $file); + } + } + closedir($dir); + } + public function testCreate() { $document = new Zend_Search_Lucene_Document(); @@ -171,6 +188,32 @@ 'contributing.html')); } + + /** + * @group ZF-4252 + */ + public function testHtmlInlineTagsIndexing() + { + $index = Zend_Search_Lucene::create(dirname(__FILE__) . '/_index/_files'); + + $htmlString = 'Hello World' + . 'ZendFramework' . "\n" . '
Foo
Bar ' . "\n" + . ' Test'; + + $doc = Zend_Search_Lucene_Document_Html::loadHTML($htmlString); + + $index->addDocument($doc); + + $hits = $index->find('FooBar'); + $this->assertEquals(count($hits), 0); + + $hits = $index->find('ZendFramework'); + $this->assertEquals(count($hits), 1); + + unset($index); + $this->_clearDirectory(dirname(__FILE__) . '/_index/_files'); + } + public function testHtmlNoFollowLinks() { $html = ''