Make content handlers assemble content for search

author Stanislav Malyshev <smalyshev@gmail.com>

Mon, 16 May 2016 20:24:10 +0000 (13:24 -0700)

committer Stanislav Malyshev <smalyshev@gmail.com>

Tue, 26 Jul 2016 20:08:45 +0000 (13:08 -0700)
author Stanislav Malyshev <smalyshev@gmail.com>
Mon, 16 May 2016 20:24:10 +0000 (13:24 -0700)
committer Stanislav Malyshev <smalyshev@gmail.com>
Tue, 26 Jul 2016 20:08:45 +0000 (13:08 -0700)
diff --git a/autoload.php b/autoload.php

index 5808040..6aab98d 100644 (file)
--- a/autoload.php
+++ b/autoload.php
@@ -1539,6 +1539,7 @@ $wgAutoloadLocalClasses = [
         'WikiReference' => __DIR__ . '/includes/WikiMap.php',
         'WikiRevision' => __DIR__ . '/includes/import/WikiRevision.php',
         'WikiStatsOutput' => __DIR__ . '/maintenance/language/StatOutputs.php',
+       'WikiTextStructure' => __DIR__ . '/includes/content/WikiTextStructure.php',
         'WikitextContent' => __DIR__ . '/includes/content/WikitextContent.php',
         'WikitextContentHandler' => __DIR__ . '/includes/content/WikitextContentHandler.php',
         'WinCacheBagOStuff' => __DIR__ . '/includes/libs/objectcache/WinCacheBagOStuff.php',
diff --git a/docs/hooks.txt b/docs/hooks.txt

index 57240c9..7396fe1 100644 (file)
--- a/docs/hooks.txt
+++ b/docs/hooks.txt
@@ -2620,6 +2620,18 @@ search results.
  $title: Current Title object being displayed in search results.
  &$id: Revision ID (default is false, for latest)
  
+'SearchIndexFields': Add fields to search index mapping.
+&$fields: Array of fields, all implement SearchIndexField
+$engine: SearchEngine instance for which mapping is being built.
+
+'SearchDataForIndex': Add data to search document. Allows adding any data to
+the field map used to index the document.
+&$fields: Array of name => value pairs for fields
+$handler: ContentHandler for the content being indexed
+$page: WikiPage that is being indexed
+$output: ParserOutput that is produced from the page
+$engine: SearchEngine for which the indexing is intended
+
  'SecondaryDataUpdates': Allows modification of the list of DataUpdates to
  perform when page content is modified. Currently called by
  AbstractContent::getSecondaryDataUpdates.
diff --git a/includes/content/ContentHandler.php b/includes/content/ContentHandler.php

index 1ecd614..7184980 100644 (file)
--- a/includes/content/ContentHandler.php
+++ b/includes/content/ContentHandler.php
@@ -1270,4 +1270,69 @@ abstract class ContentHandler {
                  */
                 return [];
         }
+
+       /**
+        * Add new field definition to array.
+        * @param SearchIndexField[] $fields
+        * @param SearchEngine       $engine
+        * @param string             $name
+        * @param int                $type
+        * @return SearchIndexField[] new field defs
+        * @since 1.28
+        */
+       protected function addSearchField( &$fields, SearchEngine $engine, $name, $type ) {
+               $fields[$name] = $engine->makeSearchFieldMapping( $name, $type );
+               return $fields;
+       }
+
+       /**
+        * Return fields to be indexed by search engine
+        * as representation of this document.
+        * Overriding class should call parent function or take care of calling
+        * the SearchDataForIndex hook.
+        * @param WikiPage     $page Page to index
+        * @param ParserOutput $output
+        * @param SearchEngine $engine Search engine for which we are indexing
+        * @return array Map of name=>value for fields
+        * @since 1.28
+        */
+       public function getDataForSearchIndex( WikiPage $page, ParserOutput $output,
+                                              SearchEngine $engine ) {
+               $fields = [];
+               $content = $page->getContent();
+               if ( $content ) {
+                       $text = $content->getTextForSearchIndex();
+                       $fields['text'] = $text;
+                       $fields['source_text'] = $text;
+                       $fields['text_bytes'] = $content->getSize();
+               }
+               Hooks::run( 'SearchDataForIndex', [ &$fields, $this, $page, $output, $engine ] );
+               return $fields;
+       }
+
+       /**
+        * Produce page output suitable for indexing.
+        * Specific content handlers may override it if they need different content handling.
+        *
+        * @param WikiPage         $page Page to render
+        * @param ParserCache|null $cache Cache to read from and save to, or null to skip caching
+        * @return ParserOutput
+        * @since 1.28
+        */
+       public function getParserOutputForIndexing( WikiPage $page, ParserCache $cache = null ) {
+               $parserOptions = $page->makeParserOptions( 'canonical' );
+               $revId = $page->getRevision()->getId();
+               // Only consult the parser cache when one was supplied.
+               $parserOutput = $cache ? $cache->get( $page, $parserOptions ) : false;
+               if ( !$parserOutput ) {
+                       // Nothing usable from the cache: render now and store the result if possible.
+                       $parserOutput =
+                               $page->getContent()->getParserOutput( $page->getTitle(), $revId, $parserOptions );
+                       if ( $cache ) {
+                               $cache->save( $parserOutput, $page, $parserOptions );
+                       }
+               }
+               return $parserOutput;
+       }
+
  }
diff --git a/includes/content/TextContentHandler.php b/includes/content/TextContentHandler.php

index 748c810..d4fad44 100644 (file)
--- a/includes/content/TextContentHandler.php
+++ b/includes/content/TextContentHandler.php
@@ -148,4 +148,13 @@ class TextContentHandler extends ContentHandler {
                         $engine->makeSearchFieldMapping( 'language', SearchIndexField::INDEX_TYPE_KEYWORD );
                 return $fields;
         }
+
+       public function getDataForSearchIndex( WikiPage $page, ParserOutput $output,
+                                              SearchEngine $engine ) {
+               $fields = parent::getDataForSearchIndex( $page, $output, $engine );
+               $fields['language'] =
+                       $this->getPageLanguage( $page->getTitle(), $page->getContent() )->getCode();
+               return $fields;
+       }
+
  }
diff --git a/includes/content/WikiTextStructure.php b/includes/content/WikiTextStructure.php

new file mode 100644 (file)

index 0000000..d4ba8a1
--- /dev/null
+++ b/includes/content/WikiTextStructure.php
@@ -0,0 +1,277 @@
+<?php
+
+use HtmlFormatter\HtmlFormatter;
+use MediaWiki\Logger\LoggerFactory;
+
+/**
+ * Class allowing to explore structure of parsed wikitext.
+ */
+class WikiTextStructure {
+       /**
+        * @var string
+        */
+       private $openingText;
+       /**
+        * @var string
+        */
+       private $allText;
+       /**
+        * @var string[]
+        */
+       private $auxText = [];
+       /**
+        * @var ParserOutput
+        */
+       private $parserOutput;
+
+       /**
+        * @var string[] selectors to elements that are excluded entirely from search
+        */
+       private $excludedElementSelectors = [
+               'audio', 'video',       // "it looks like you don't have javascript enabled..."
+                                       // do not need to index
+               'sup.reference',        // The [1] for references
+               '.mw-cite-backlink',    // The ↑ next to references in the references section
+               'h1', 'h2', 'h3',       // Headings are already indexed in their own field.
+               'h5', 'h6', 'h4',
+               '.autocollapse',        // Collapsed fields are hidden by default so we don't want them
+                                                               // showing up.
+       ];
+
+       /**
+        * @var string[] selectors to elements that are considered auxiliary to article text for search
+        */
+       private $auxiliaryElementSelectors = [
+               '.thumbcaption',        // Thumbnail captions aren't really part of the text proper
+               'table',                // Neither are tables
+               '.rellink',             // Common style for "See also:".
+               '.dablink',             // Common style for calling out helpful links at the top
+                                                               // of the article.
+               '.searchaux',           // New class users can use to mark stuff as auxiliary to searches.
+       ];
+
+       /**
+        * WikiTextStructure constructor.
+        * @param ParserOutput $parserOutput
+        */
+       public function __construct( ParserOutput $parserOutput ) {
+               $this->parserOutput = $parserOutput;
+       }
+
+       /**
+        * Get categories in the text.
+        * @return string[]
+        */
+       public function categories() {
+               $categories = [];
+               foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) {
+                       $categories[] = Category::newFromName( $key )->getTitle()->getText();
+               }
+               return $categories;
+       }
+
+       /**
+        * Get outgoing links.
+        * @return string[]
+        */
+       public function outgoingLinks() {
+               $outgoingLinks = [];
+               foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) {
+                       foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) {
+                               $outgoingLinks[] =
+                                       Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey();
+                       }
+               }
+               return $outgoingLinks;
+       }
+
+       /**
+        * Get templates in the text.
+        * @return string[]
+        */
+       public function templates() {
+               $templates = [];
+               foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) {
+                       foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
+                               $templateTitle = Title::makeTitleSafe( $tNS, $tDbKey );
+                               if ( $templateTitle && $templateTitle->exists() ) {
+                                       $templates[] = $templateTitle->getPrefixedText();
+                               }
+                       }
+               }
+               return $templates;
+       }
+
+       /**
+        * Get headings on the page.
+        * First strip out things that look like references.  We can't use HTML filtering because
+        * the references come back as <sup> tags without a class.  To keep from breaking stuff like
+        *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
+        * we don't remove the whole <sup> tag.  We also don't want to strip the <sup> tag and remove
+        * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
+        * or something.  Whatever.  So we only strip things that look like <sup> tags wrapping a
+        * reference.  And since the data looks like:
+        *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
+        * we can not really use HtmlFormatter as we have no suitable selector.
+        * @return string[] Tag-free heading texts, without the ignored headings
+        */
+       public function headings() {
+               $headings = [];
+               $ignoredHeadings = $this->getIgnoredHeadings();
+               foreach ( $this->parserOutput->getSections() as $heading ) {
+                       $heading = $heading[ 'line' ];
+
+                       // Some wikis wrap the brackets in a span:
+                       // http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
+                       $heading = preg_replace( '/<\/?span>/', '', $heading );
+                       // Normalize [] so the following regexp would work.
+                       $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
+                       $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );
+
+                       // Strip tags from the heading or else we'll display them (escaped) in search results
+                       $heading = trim( Sanitizer::stripAllTags( $heading ) );
+
+                       // Note that we don't take the level of the heading into account - all headings are equal.
+                       // Except the ones we ignore (compared strictly, so "1.0" is not treated as "1").
+                       if ( !in_array( $heading, $ignoredHeadings, true ) ) {
+                               $headings[] = $heading;
+                       }
+               }
+               return $headings;
+       }
+
+       /**
+        * Parse a message content into an array. This function is generally used to
+        * parse settings stored as i18n messages (see search-ignored-headings).
+        *
+        * @param string $message
+        * @return string[]
+        */
+       public static function parseSettingsInMessage( $message ) {
+               $lines = explode( "\n", $message );
+               $lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
+               $lines = array_map( 'trim', $lines );          // Remove extra spaces
+               $lines = array_filter( $lines );               // Remove empty lines
+               return $lines;
+       }
+
+       /**
+        * Get list of headings to ignore, parsed from the message once and then reused.
+        * @return string[] Empty when the message is disabled
+        */
+       private function getIgnoredHeadings() {
+               static $ignoredHeadings = null;
+               if ( $ignoredHeadings === null ) {
+                       // FIXME: will be renamed in next patches to search-ignored-headings
+                       $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
+                       $ignoredHeadings = [];
+                       if ( !$source->isDisabled() ) {
+                               $lines = self::parseSettingsInMessage( $source->plain() );
+                               $ignoredHeadings = $lines;               // Now we just have headings!
+                       }
+               }
+               return $ignoredHeadings;
+       }
+
+       /**
+        * Extract parts of the text - opening, main and auxiliary - once, caching them on this object.
+        */
+       private function extractWikitextParts() {
+               if ( !is_null( $this->allText ) ) {
+                       return;
+               }
+               $this->parserOutput->setEditSectionTokens( false );
+               $this->parserOutput->setTOCEnabled( false );
+               $text = $this->parserOutput->getText();
+               if ( strlen( $text ) === 0 ) {
+                       $this->allText = "";
+                       // empty text - nothing to seek here
+                       return;
+               }
+
+               // Opening text is extracted from the HTML before any of the rewriting below.
+               $this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
+
+               // Add extra spacing around break tags so text crammed together like<br>this
+               // doesn't make one word.
+               $text = str_replace( '<br', "\n<br", $text );
+
+               $formatter = new HtmlFormatter( $text );
+
+               // Strip elements from the page that we never want in the search text.
+               $formatter->remove( $this->excludedElementSelectors );
+               $formatter->filterContent();
+
+               // Strip elements from the page that are auxiliary text.  These will still be
+               // searched but matches will be ranked lower and non-auxiliary matches will be
+               // preferred in highlighting.
+               $formatter->remove( $this->auxiliaryElementSelectors );
+               $auxiliaryElements = $formatter->filterContent();
+               $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
+               foreach ( $auxiliaryElements as $auxiliaryElement ) {
+                       $this->auxText[] =
+                               trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
+               }
+       }
+
+       /**
+        * Get opening text: the filtered plain text found before the first heading.
+        * @param string $text Text to search for the first heading tag
+        * @return string|null Null if there is no heading or nothing is left before it
+        */
+       private function extractHeadingBeforeFirstHeading( $text ) {
+               $matches = [];
+               if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
+                       // No heading tag at all, so the text has no separate opening part
+                       // to report.
+                       return null;
+               }
+               $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
+               if ( !$text ) {
+                       // No text before the first heading, so there is no opening
+                       // text.
+                       return null;
+               }
+
+               $formatter = new HtmlFormatter( $text );
+               $formatter->remove( $this->excludedElementSelectors );
+               $formatter->remove( $this->auxiliaryElementSelectors );
+               $formatter->filterContent();
+               $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
+
+               if ( !$text ) {
+                       // No text is left before the first heading once excluded and auxiliary
+                       // elements are removed, so there is no opening text.
+                       return null;
+               }
+
+               return $text;
+       }
+
+       /**
+        * Get opening text
+        * @return string
+        */
+       public function getOpeningText() {
+               $this->extractWikitextParts();
+               return $this->openingText;
+       }
+
+       /**
+        * Get main text
+        * @return string
+        */
+       public function getMainText() {
+               $this->extractWikitextParts();
+               return $this->allText;
+       }
+
+       /**
+        * Get auxiliary text
+        * @return string[]
+        */
+       public function getAuxiliaryText() {
+               $this->extractWikitextParts();
+               return $this->auxText;
+       }
+}
diff --git a/includes/content/WikitextContentHandler.php b/includes/content/WikitextContentHandler.php

index 4e8f0df..5c0a9c8 100644 (file)
--- a/includes/content/WikitextContentHandler.php
+++ b/includes/content/WikitextContentHandler.php
@@ -145,4 +145,44 @@ class WikitextContentHandler extends TextContentHandler {
                 return $fields;
         }
  
+       /**
+        * Extract text of the file; a file without a media handler has no text to extract.
+        * TODO: probably should go to file handler?
+        * @param Title $title
+        * @return string|null Null when the file does not exist or has no media handler
+        */
+       protected function getFileText( Title $title ) {
+               $file = wfLocalFile( $title );
+               $handler = ( $file && $file->exists() ) ? $file->getHandler() : false;
+               if ( $handler ) {
+                       return $handler->getEntireText( $file );
+               }
+               return null;
+       }
+
+       public function getDataForSearchIndex( WikiPage $page, ParserOutput $parserOutput,
+                                              SearchEngine $engine ) {
+               $fields = parent::getDataForSearchIndex( $page, $parserOutput, $engine );
+
+               $structure = new WikiTextStructure( $parserOutput );
+               $fields['external_link'] = array_keys( $parserOutput->getExternalLinks() );
+               $fields['category'] = $structure->categories();
+               $fields['heading'] = $structure->headings();
+               $fields['outgoing_link'] = $structure->outgoingLinks();
+               $fields['template'] = $structure->templates();
+               // text fields
+               $fields['opening_text'] = $structure->getOpeningText();
+               $fields['text'] = $structure->getMainText(); // overwrites one from ContentHandler
+               $fields['auxiliary_text'] = $structure->getAuxiliaryText();
+
+               $title = $page->getTitle();
+               if ( NS_FILE == $title->getNamespace() ) {
+                       $fileText = $this->getFileText( $title );
+                       if ( $fileText ) {
+                               $fields['file_text'] = $fileText;
+                       }
+               }
+               return $fields;
+       }
+
  }
diff --git a/includes/page/WikiPage.php b/includes/page/WikiPage.php

index e7352af..c17fe1e 100644 (file)
--- a/includes/page/WikiPage.php
+++ b/includes/page/WikiPage.php
@@ -1043,14 +1043,16 @@ class WikiPage implements Page, IDBAccessObject {
          *
          * @since 1.19
          * @param ParserOptions $parserOptions ParserOptions to use for the parse operation
-        * @param null|int $oldid Revision ID to get the text from, passing null or 0 will
-        *   get the current revision (default value)
-        *
-        * @return ParserOutput|bool ParserOutput or false if the revision was not found
+        * @param null|int      $oldid Revision ID to get the text from, passing null or 0 will
+        *                             get the current revision (default value)
+        * @param bool          $forceParse Force a fresh parse, regardless of cache settings
+        * @return bool|ParserOutput ParserOutput or false if the revision was not found
          */
-       public function getParserOutput( ParserOptions $parserOptions, $oldid = null ) {
+       public function getParserOutput( ParserOptions $parserOptions, $oldid = null,
+                                        $forceParse = false ) {
  
-               $useParserCache = $this->shouldCheckParserCache( $parserOptions, $oldid );
+               $useParserCache =
+                       ( !$forceParse ) && $this->shouldCheckParserCache( $parserOptions, $oldid );
                 wfDebug( __METHOD__ .
                         ': using parser cache: ' . ( $useParserCache ? 'yes' : 'no' ) . "\n" );
                 if ( $parserOptions->getStubThreshold() ) {
diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php

index 9168d64..c2ccca0 100644 (file)
--- a/includes/search/SearchEngine.php
+++ b/includes/search/SearchEngine.php
@@ -659,7 +659,7 @@ abstract class SearchEngine {
          * Create a search field definition.
          * Specific search engines should override this method to create search fields.
          * @param string $name
-        * @param int    $type
+        * @param int    $type One of the types in SearchIndexField::INDEX_TYPE_*
          * @return SearchIndexField
          * @since 1.28
          */
diff --git a/tests/phpunit/MediaWikiTestCase.php b/tests/phpunit/MediaWikiTestCase.php

index 8dfe628..e6826d8 100644 (file)
--- a/tests/phpunit/MediaWikiTestCase.php
+++ b/tests/phpunit/MediaWikiTestCase.php
@@ -1775,4 +1775,15 @@ abstract class MediaWikiTestCase extends PHPUnit_Framework_TestCase {
                 return $buffer;
         }
  
+       /**
+        * Create a temporary hook handler which will be reset by tearDown.
+        * This replaces other handlers for the same hook.
+        * @param string $hookName Hook name
+        * @param mixed $handler Value suitable for a hook handler
+        * @since 1.28
+        */
+       protected function setTemporaryHook( $hookName, $handler ) {
+               $this->mergeMwGlobalArrayValue( 'wgHooks', [ $hookName => [ $handler ] ] );
+       }
+
  }
diff --git a/tests/phpunit/includes/content/ContentHandlerTest.php b/tests/phpunit/includes/content/ContentHandlerTest.php

index 545b964..bb9050f 100644 (file)
--- a/tests/phpunit/includes/content/ContentHandlerTest.php
+++ b/tests/phpunit/includes/content/ContentHandlerTest.php
@@ -3,6 +3,7 @@ use MediaWiki\MediaWikiServices;
  
  /**
   * @group ContentHandler
+ * @group Database
   */
  class ContentHandlerTest extends MediaWikiTestCase {
  
@@ -52,6 +53,11 @@ class ContentHandlerTest extends MediaWikiTestCase {
                 parent::tearDown();
         }
  
+       public function addDBDataOnce() {
+               $this->insertPage( 'Not_Main_Page', 'This is not a main page' );
+               $this->insertPage( 'Smithee', 'A smithee is one who smiths. See also [[Alan Smithee]]' );
+       }
+
         public static function dataGetDefaultModelFor() {
                 return [
                         [ 'Help:Foo', CONTENT_MODEL_WIKITEXT ],
@@ -409,4 +415,39 @@ class ContentHandlerTest extends MediaWikiTestCase {
                 $this->assertInstanceOf( $handlerClass, $handler );
         }
  
+       /**
+        * @covers ContentHandler::getDataForSearchIndex
+        */
+       public function testDataIndexFields() {
+               $mockEngine = $this->getMock( 'SearchEngine' );
+               $title = Title::newFromText( 'Not_Main_Page', NS_MAIN );
+               $page = new WikiPage( $title );
+
+               $this->setTemporaryHook( 'SearchDataForIndex',
+                       function ( &$fields, ContentHandler $handler, WikiPage $page, ParserOutput $output,
+                                  SearchEngine $engine ) {
+                               $fields['testDataField'] = 'test content';
+                       } );
+
+               $output = $page->getContent()->getParserOutput( $title );
+               $data = $page->getContentHandler()->getDataForSearchIndex( $page, $output, $mockEngine );
+               $this->assertArrayHasKey( 'text', $data );
+               $this->assertArrayHasKey( 'text_bytes', $data );
+               $this->assertArrayHasKey( 'language', $data );
+               $this->assertArrayHasKey( 'testDataField', $data );
+               $this->assertEquals( 'test content', $data['testDataField'] );
+       }
+
+       /**
+        * @covers ContentHandler::getParserOutputForIndexing
+        */
+       public function testParserOutputForIndexing() {
+               $title = Title::newFromText( 'Smithee', NS_MAIN );
+               $page = new WikiPage( $title );
+
+               $out = $page->getContentHandler()->getParserOutputForIndexing( $page );
+               $this->assertInstanceOf( ParserOutput::class, $out );
+               $this->assertContains( 'one who smiths', $out->getRawText() );
+       }
+
  }
diff --git a/tests/phpunit/includes/content/TextContentHandlerTest.php b/tests/phpunit/includes/content/TextContentHandlerTest.php

index e8681c7..918815c 100644 (file)
--- a/tests/phpunit/includes/content/TextContentHandlerTest.php
+++ b/tests/phpunit/includes/content/TextContentHandlerTest.php
@@ -49,5 +49,4 @@ class TextContentHandlerTest extends MediaWikiLangTestCase {
                 $this->assertEquals( 'test', $mappedFields['language']['testData'] );
                 $this->assertEquals( 'language', $mappedFields['language']['name'] );
         }
-
  }
diff --git a/tests/phpunit/includes/content/WikitextContentHandlerTest.php b/tests/phpunit/includes/content/WikitextContentHandlerTest.php

index f632882..9d4abe8 100644 (file)
--- a/tests/phpunit/includes/content/WikitextContentHandlerTest.php
+++ b/tests/phpunit/includes/content/WikitextContentHandlerTest.php
@@ -243,4 +243,20 @@ class WikitextContentHandlerTest extends MediaWikiLangTestCase {
         ) {
         }
         */
+
+       public function testDataIndexFieldsFile() {
+               $mockEngine = $this->getMock( 'SearchEngine' );
+               $title = Title::newFromText( 'Somefile.jpg', NS_FILE );
+               $page = new WikiPage( $title );
+
+               $handler = $this->getMockBuilder( WikitextContentHandler::class )
+                       ->disableOriginalConstructor()
+                       ->setMethods( [ 'getFileText' ] )
+                       ->getMock();
+               $handler->method( 'getFileText' )->will( $this->returnValue( 'This is file content' ) );
+
+               $data = $handler->getDataForSearchIndex( $page, new ParserOutput(), $mockEngine );
+               $this->assertArrayHasKey( 'file_text', $data );
+               $this->assertEquals( 'This is file content', $data['file_text'] );
+       }
  }
diff --git a/tests/phpunit/includes/content/WikitextStructureTest.php b/tests/phpunit/includes/content/WikitextStructureTest.php

new file mode 100644 (file)

index 0000000..d4647f1
--- /dev/null
+++ b/tests/phpunit/includes/content/WikitextStructureTest.php
@@ -0,0 +1,148 @@
+<?php
+
+class WikitextStructureTest extends MediaWikiLangTestCase {
+
+       private function getMockTitle() {
+               return Title::newFromText( "TestTitle" );
+       }
+
+       /**
+        * Get parser output for Wiki text
+        * @param $text
+        * @return ParserOutput
+        */
+       private function getParserOutput( $text ) {
+               $content = new WikitextContent( $text );
+               return $content->getParserOutput( $this->getMockTitle() );
+       }
+
+       /**
+        * Get WikitextStructure for given text
+        * @param $text
+        * @return WikiTextStructure
+        */
+       private function getStructure( $text ) {
+               return new WikiTextStructure( $this->getParserOutput( $text ) );
+       }
+
+       public function testCategories() {
+               $text = <<<END
+We also have a {{Template}} and an {{Another template}} in addition. 
+This text also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
+And [[Category:Some Category| this category]] is repeated.
+END;
+               $struct = $this->getStructure( $text );
+               $cats = $struct->categories();
+               $this->assertCount( 2, $cats );
+               $this->assertContains( "Some Category", $cats );
+               $this->assertContains( "Yet another category", $cats );
+       }
+
+       public function testOutgoingLinks() {
+               $text = <<<END
+Here I add link to [[Some Page]]. And [[Some Page|This same page]] gets linked twice. 
+We also have [[File:Image.jpg|image]].
+We also have a {{Template}} and an {{Another template}} in addition. 
+Some templates are {{lowercase}}.
+And [[Some_Page]] is linked again. 
+It also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
+Also link to a [[Talk:TestTitle|talk page]] is here. 
+END;
+               $struct = $this->getStructure( $text );
+               $links = $struct->outgoingLinks();
+               $this->assertContains( "Some_Page", $links );
+               $this->assertContains( "Template:Template", $links );
+               $this->assertContains( "Template:Another_template", $links );
+               $this->assertContains( "Template:Lowercase", $links );
+               $this->assertContains( "Talk:TestTitle", $links );
+               $this->assertCount( 5, $links );
+       }
+
+       public function testTemplates() {
+               $text = <<<END
+We have a {{Template}} and an {{Another template}} in addition. 
+Some templates are {{lowercase}}. And this {{Template}} is repeated. 
+Here is {{another_template|with=argument}}.
+This is a template that {{Xdoes not exist}}.
+END;
+               $this->setTemporaryHook( 'TitleExists', function ( Title $title, &$exists ) {
+                       $txt = $title->getBaseText();
+                       if ( $txt[0] != 'X' ) {
+                               $exists = true;
+                       }
+                       return true;
+               } );
+               $struct = $this->getStructure( $text );
+               $templates = $struct->templates();
+               $this->assertCount( 3, $templates );
+               $this->assertContains( "Template:Template", $templates );
+               $this->assertContains( "Template:Another template", $templates );
+               $this->assertContains( "Template:Lowercase", $templates );
+       }
+
+       public function testHeadings() {
+               $text = <<<END
+Some text here
+== Heading one ==
+Some text
+==== heading two ====
+More text
+=== Applicability of the strict mass-energy equivalence formula, ''E'' = ''mc''<sup>2</sup> ===
+and more text
+== Wikitext '''in''' [[Heading]] and also <b>html</b> ==
+more text
+END;
+// FIXME: add test for ==== See also ==== after cirrussearch-ignored-headings is renamed
+               $struct = $this->getStructure( $text );
+               $headings = $struct->headings();
+               $this->assertCount( 4, $headings );
+               $this->assertContains( "Heading one", $headings );
+               $this->assertContains( "heading two", $headings );
+               $this->assertContains( "Applicability of the strict mass-energy equivalence formula, E = mc2",
+                       $headings );
+               $this->assertContains( "Wikitext in Heading and also html", $headings );
+       }
+
+       public function testHeadingsFirst() {
+               $text = <<<END
+== Heading one ==
+Some text
+==== heading two ====
+END;
+               $struct = $this->getStructure( $text );
+               $headings = $struct->headings();
+               $this->assertCount( 2, $headings );
+               $this->assertContains( "Heading one", $headings );
+               $this->assertContains( "heading two", $headings );
+       }
+
+       public function testHeadingsNone() {
+               $text = "This text is completely devoid of headings.";
+               $struct = $this->getStructure( $text );
+               $headings = $struct->headings();
+               $this->assertArrayEquals( [], $headings );
+       }
+
+       public function testTexts() {
+               $text = <<<END
+Opening text is opening.
+== Then comes header ==
+Then we got more<br>text
+=== And more headers ===
+{| class="wikitable"
+|-
+! Header table
+|-
+| row in table
+|-
+| another row in table
+|}
+END;
+               $struct = $this->getStructure( $text );
+               $this->assertEquals( "Opening text is opening.", $struct->getOpeningText() );
+               $this->assertEquals( "Opening text is opening.   Then we got more text",
+                       $struct->getMainText() );
+               $this->assertEquals( [ "Header table  row in table  another row in table" ],
+                       $struct->getAuxiliaryText() );
+       }
+}
diff --git a/tests/phpunit/includes/search/SearchEngineTest.php b/tests/phpunit/includes/search/SearchEngineTest.php

index f084c64..081cb38 100644 (file)
--- a/tests/phpunit/includes/search/SearchEngineTest.php
+++ b/tests/phpunit/includes/search/SearchEngineTest.php
@@ -185,8 +185,12 @@ class SearchEngineTest extends MediaWikiLangTestCase {
                         ->willReturnCallback( $mockFieldBuilder );
  
                 // Not using mock since PHPUnit mocks do not work properly with references in params
-               $this->mergeMwGlobalArrayValue( 'wgHooks',
-                       [ 'SearchIndexFields' => [ [ $this, 'hookSearchIndexFields', $mockFieldBuilder ] ] ] );
+               $this->setTemporaryHook( 'SearchIndexFields',
+                       function ( &$fields, SearchEngine $engine ) use ( $mockFieldBuilder ) {
+                               $fields['testField'] =
+                                       $mockFieldBuilder( "testField", SearchIndexField::INDEX_TYPE_TEXT );
+                               return true;
+                       } );
  
                 $fields = $mockEngine->getSearchIndexFields();
                 $this->assertArrayHasKey( 'language', $fields );
@@ -197,9 +201,4 @@ class SearchEngineTest extends MediaWikiLangTestCase {
                 $this->assertArrayHasKey( 'testData', $mapping );
                 $this->assertEquals( 'test', $mapping['testData'] );
         }
-
-       public function hookSearchIndexFields( $mockFieldBuilder, &$fields, SearchEngine $engine ) {
-               $fields['testField'] = $mockFieldBuilder( "testField", SearchIndexField::INDEX_TYPE_TEXT );
-               return true;
-       }
  }
author	Stanislav Malyshev <smalyshev@gmail.com>
	Mon, 16 May 2016 20:24:10 +0000 (13:24 -0700)
committer	Stanislav Malyshev <smalyshev@gmail.com>
	Tue, 26 Jul 2016 20:08:45 +0000 (13:08 -0700)
autoload.php		patch \| blob \| history
docs/hooks.txt		patch \| blob \| history
includes/content/ContentHandler.php		patch \| blob \| history
includes/content/TextContentHandler.php		patch \| blob \| history
includes/content/WikiTextStructure.php	[new file with mode: 0644]	patch \| blob
includes/content/WikitextContentHandler.php		patch \| blob \| history
includes/page/WikiPage.php		patch \| blob \| history
includes/search/SearchEngine.php		patch \| blob \| history
tests/phpunit/MediaWikiTestCase.php		patch \| blob \| history
tests/phpunit/includes/content/ContentHandlerTest.php		patch \| blob \| history
tests/phpunit/includes/content/TextContentHandlerTest.php		patch \| blob \| history
tests/phpunit/includes/content/WikitextContentHandlerTest.php		patch \| blob \| history
tests/phpunit/includes/content/WikitextStructureTest.php	[new file with mode: 0644]	patch \| blob
tests/phpunit/includes/search/SearchEngineTest.php		patch \| blob \| history