Merge "Extract ParserOutput search index data fields from WikiTextContentHandler"
authorjenkins-bot <jenkins-bot@gerrit.wikimedia.org>
Fri, 19 Aug 2016 18:40:17 +0000 (18:40 +0000)
committerGerrit Code Review <gerrit@wikimedia.org>
Fri, 19 Aug 2016 18:40:17 +0000 (18:40 +0000)
autoload.php
includes/content/ContentHandler.php
includes/content/WikiTextStructure.php
includes/content/WikitextContentHandler.php
includes/search/DummySearchIndexFieldDefinition.php [new file with mode: 0644]
includes/search/ParserOutputSearchDataExtractor.php [new file with mode: 0644]
includes/search/SearchIndexFieldDefinition.php
tests/phpunit/includes/content/ContentHandlerTest.php
tests/phpunit/includes/content/WikitextStructureTest.php
tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php [new file with mode: 0644]

index fc6577e..e37a011 100644 (file)
@@ -372,6 +372,7 @@ $wgAutoloadLocalClasses = [
        'DoubleRedirectsPage' => __DIR__ . '/includes/specials/SpecialDoubleRedirects.php',
        'DoubleReplacer' => __DIR__ . '/includes/libs/replacers/DoubleReplacer.php',
        'DummyLinker' => __DIR__ . '/includes/DummyLinker.php',
+       'DummySearchIndexFieldDefinition' => __DIR__ . '/includes/search/DummySearchIndexFieldDefinition.php',
        'DummyTermColorer' => __DIR__ . '/maintenance/term/MWTerm.php',
        'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php',
        'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php',
@@ -857,6 +858,7 @@ $wgAutoloadLocalClasses = [
        'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php',
        'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php',
        'MediaWiki\\MediaWikiServices' => __DIR__ . '/includes/MediaWikiServices.php',
+       'MediaWiki\\Search\\ParserOutputSearchDataExtractor' => __DIR__ . '/includes/search/ParserOutputSearchDataExtractor.php',
        'MediaWiki\\Services\\CannotReplaceActiveServiceException' => __DIR__ . '/includes/Services/CannotReplaceActiveServiceException.php',
        'MediaWiki\\Services\\ContainerDisabledException' => __DIR__ . '/includes/Services/ContainerDisabledException.php',
        'MediaWiki\\Services\\DestructibleService' => __DIR__ . '/includes/Services/DestructibleService.php',
index 5a5c0d8..41fdef5 100644 (file)
@@ -1,4 +1,7 @@
 <?php
+
+use MediaWiki\Search\ParserOutputSearchDataExtractor;
+
 /**
  * Base class for content handling.
  *
@@ -1208,24 +1211,40 @@ abstract class ContentHandler {
 
        /**
         * Get fields definition for search index
+        *
+        * @todo Expose title, redirect, namespace, text, source_text, text_bytes
+        *       field mappings here. (see T142670 and T143409)
+        *
         * @param SearchEngine $engine
         * @return SearchIndexField[] List of fields this content handler can provide.
         * @since 1.28
         */
        public function getFieldsForSearchIndex( SearchEngine $engine ) {
-               /* Default fields:
-               /*
-                * namespace
-                * namespace_text
-                * redirect
-                * source_text
-                * suggest
-                * timestamp
-                * title
-                * text
-                * text_bytes
-                */
-               return [];
+               $fields['category'] = $engine->makeSearchFieldMapping(
+                       'category',
+                       SearchIndexField::INDEX_TYPE_TEXT
+               );
+
+               $fields['category']->setFlag( SearchIndexField::FLAG_CASEFOLD );
+
+               $fields['external_link'] = $engine->makeSearchFieldMapping(
+                       'external_link',
+                       SearchIndexField::INDEX_TYPE_KEYWORD
+               );
+
+               $fields['outgoing_link'] = $engine->makeSearchFieldMapping(
+                       'outgoing_link',
+                       SearchIndexField::INDEX_TYPE_KEYWORD
+               );
+
+               $fields['template'] = $engine->makeSearchFieldMapping(
+                       'template',
+                       SearchIndexField::INDEX_TYPE_KEYWORD
+               );
+
+               $fields['template']->setFlag( SearchIndexField::FLAG_CASEFOLD );
+
+               return $fields;
        }
 
        /**
@@ -1255,16 +1274,26 @@ abstract class ContentHandler {
         */
        public function getDataForSearchIndex( WikiPage $page, ParserOutput $output,
                                               SearchEngine $engine ) {
-               $fields = [];
+               $fieldData = [];
                $content = $page->getContent();
+
                if ( $content ) {
+                       $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+                       $fieldData['category'] = $searchDataExtractor->getCategories( $output );
+                       $fieldData['external_link'] = $searchDataExtractor->getExternalLinks( $output );
+                       $fieldData['outgoing_link'] = $searchDataExtractor->getOutgoingLinks( $output );
+                       $fieldData['template'] = $searchDataExtractor->getTemplates( $output );
+
                        $text = $content->getTextForSearchIndex();
-                       $fields['text'] = $text;
-                       $fields['source_text'] = $text;
-                       $fields['text_bytes'] = $content->getSize();
+
+                       $fieldData['text'] = $text;
+                       $fieldData['source_text'] = $text;
+                       $fieldData['text_bytes'] = $content->getSize();
                }
-               Hooks::run( 'SearchDataForIndex', [ &$fields, $this, $page, $output, $engine ] );
-               return $fields;
+
+               Hooks::run( 'SearchDataForIndex', [ &$fieldData, $this, $page, $output, $engine ] );
+               return $fieldData;
        }
 
        /**
index e83c213..9768d36 100644 (file)
@@ -58,50 +58,6 @@ class WikiTextStructure {
                $this->parserOutput = $parserOutput;
        }
 
-       /**
-        * Get categories in the text.
-        * @return string[]
-        */
-       public function categories() {
-               $categories = [];
-               foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) {
-                       $categories[] = Category::newFromName( $key )->getTitle()->getText();
-               }
-               return $categories;
-       }
-
-       /**
-        * Get outgoing links.
-        * @return string[]
-        */
-       public function outgoingLinks() {
-               $outgoingLinks = [];
-               foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) {
-                       foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) {
-                               $outgoingLinks[] =
-                                       Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey();
-                       }
-               }
-               return $outgoingLinks;
-       }
-
-       /**
-        * Get templates in the text.
-        * @return string[]
-        */
-       public function templates() {
-               $templates = [];
-               foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) {
-                       foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
-                               $templateTitle = Title::makeTitleSafe( $tNS, $tDbKey );
-                               if ( $templateTitle && $templateTitle->exists() ) {
-                                       $templates[] = $templateTitle->getPrefixedText();
-                               }
-                       }
-               }
-               return $templates;
-       }
-
        /**
         * Get headings on the page.
         * @return string[]
index 3ad7665..1c46d28 100644 (file)
@@ -111,13 +111,6 @@ class WikitextContentHandler extends TextContentHandler {
        public function getFieldsForSearchIndex( SearchEngine $engine ) {
                $fields = parent::getFieldsForSearchIndex( $engine );
 
-               $fields['category'] =
-                       $engine->makeSearchFieldMapping( 'category', SearchIndexField::INDEX_TYPE_TEXT );
-               $fields['category']->setFlag( SearchIndexField::FLAG_CASEFOLD );
-
-               $fields['external_link'] =
-                       $engine->makeSearchFieldMapping( 'external_link', SearchIndexField::INDEX_TYPE_KEYWORD );
-
                $fields['heading'] =
                        $engine->makeSearchFieldMapping( 'heading', SearchIndexField::INDEX_TYPE_TEXT );
                $fields['heading']->setFlag( SearchIndexField::FLAG_SCORING );
@@ -130,13 +123,6 @@ class WikitextContentHandler extends TextContentHandler {
                $fields['opening_text']->setFlag( SearchIndexField::FLAG_SCORING |
                                                  SearchIndexField::FLAG_NO_HIGHLIGHT );
 
-               $fields['outgoing_link'] =
-                       $engine->makeSearchFieldMapping( 'outgoing_link', SearchIndexField::INDEX_TYPE_KEYWORD );
-
-               $fields['template'] =
-                       $engine->makeSearchFieldMapping( 'template', SearchIndexField::INDEX_TYPE_KEYWORD );
-               $fields['template']->setFlag( SearchIndexField::FLAG_CASEFOLD );
-
                // FIXME: this really belongs in separate file handler but files
                // do not have separate handler. Sadness.
                $fields['file_text'] =
@@ -169,11 +155,7 @@ class WikitextContentHandler extends TextContentHandler {
                $fields = parent::getDataForSearchIndex( $page, $parserOutput, $engine );
 
                $structure = new WikiTextStructure( $parserOutput );
-               $fields['external_link'] = array_keys( $parserOutput->getExternalLinks() );
-               $fields['category'] = $structure->categories();
                $fields['heading'] = $structure->headings();
-               $fields['outgoing_link'] = $structure->outgoingLinks();
-               $fields['template'] = $structure->templates();
                // text fields
                $fields['opening_text'] = $structure->getOpeningText();
                $fields['text'] = $structure->getMainText(); // overwrites one from ContentHandler
diff --git a/includes/search/DummySearchIndexFieldDefinition.php b/includes/search/DummySearchIndexFieldDefinition.php
new file mode 100644 (file)
index 0000000..a2a6760
--- /dev/null
@@ -0,0 +1,30 @@
+<?php
+
+/**
+ * Dummy implementation of SearchIndexFieldDefinition for testing purposes.
+ *
+ * @since 1.28
+ */
+class DummySearchIndexFieldDefinition extends SearchIndexFieldDefinition {
+
+       /**
+        * Describe this field, together with its subfields, as a plain array.
+        *
+        * @param SearchEngine $engine Engine handed on to every subfield's getMapping().
+        *
+        * @return array Map with the keys 'name', 'type', 'flags' and 'subfields'.
+        */
+       public function getMapping( SearchEngine $engine ) {
+               $mapping = [
+                       'name' => $this->name,
+                       'type' => $this->type,
+                       'flags' => $this->flags,
+                       'subfields' => []
+               ];
+
+               foreach ( $this->subfields as $subfield ) {
+                       // getMapping() declares a required SearchEngine parameter, so the
+                       // engine has to be forwarded; calling it bare fails on any subfield.
+                       $mapping['subfields'][] = $subfield->getMapping( $engine );
+               }
+
+               return $mapping;
+       }
+
+}
diff --git a/includes/search/ParserOutputSearchDataExtractor.php b/includes/search/ParserOutputSearchDataExtractor.php
new file mode 100644 (file)
index 0000000..df653f1
--- /dev/null
@@ -0,0 +1,92 @@
+<?php
+
+namespace MediaWiki\Search;
+
+use Category;
+use ParserOutput;
+use Title;
+
+/**
+ * Extracts data from ParserOutput for indexing in the search engine.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.28
+ */
+class ParserOutputSearchDataExtractor {
+
+       /**
+        * Get a list of categories, as an array with title text strings.
+        *
+        * Names for which Category::newFromName() yields no Category object are
+        * skipped, so one invalid name cannot abort extraction for the whole page.
+        *
+        * @param ParserOutput $parserOutput
+        *
+        * @return string[]
+        */
+       public function getCategories( ParserOutput $parserOutput ) {
+               $categories = [];
+
+               foreach ( $parserOutput->getCategoryLinks() as $key ) {
+                       $category = Category::newFromName( $key );
+
+                       // The factory reports a totally invalid name with false instead
+                       // of a Category; calling getTitle() on that would be fatal.
+                       if ( !$category ) {
+                               continue;
+                       }
+
+                       $categories[] = $category->getTitle()->getText();
+               }
+
+               return $categories;
+       }
+
+       /**
+        * Get a list of external links from ParserOutput, as an array of strings.
+        *
+        * The links are the keys of ParserOutput::getExternalLinks().
+        *
+        * @param ParserOutput $parserOutput
+        *
+        * @return string[]
+        */
+       public function getExternalLinks( ParserOutput $parserOutput ) {
+               return array_keys( $parserOutput->getExternalLinks() );
+       }
+
+       /**
+        * Get a list of outgoing wiki links (including interwiki links), as
+        * an array of prefixed title strings.
+        *
+        * Each entry is the prefixed DB key of a title listed by
+        * ParserOutput::getLinks().
+        *
+        * NOTE(review): confirm that getLinks() really carries interwiki links;
+        * nothing in this class adds them separately.
+        *
+        * @param ParserOutput $parserOutput
+        *
+        * @return string[]
+        */
+       public function getOutgoingLinks( ParserOutput $parserOutput ) {
+               $outgoingLinks = [];
+
+               foreach ( $parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) {
+                       foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) {
+                               $outgoingLinks[] =
+                                       Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey();
+                       }
+               }
+
+               return $outgoingLinks;
+       }
+
+       /**
+        * Get a list of templates used in the ParserOutput content, as prefixed title strings
+        *
+        * Every template listed by ParserOutput::getTemplates() is returned; no
+        * check is made here for whether the template page exists.
+        *
+        * @param ParserOutput $parserOutput
+        *
+        * @return string[]
+        */
+       public function getTemplates( ParserOutput $parserOutput ) {
+               $templates = [];
+
+               foreach ( $parserOutput->getTemplates() as $tNS => $templatesInNS ) {
+                       foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
+                               $templateTitle = Title::makeTitle( $tNS, $tDbKey );
+                               $templates[] = $templateTitle->getPrefixedText();
+                       }
+               }
+
+               return $templates;
+       }
+
+}
index 3a86c82..8a06b65 100644 (file)
@@ -2,8 +2,10 @@
 
 /**
  * Basic infrastructure of the field definition.
- * Specific engines will need to override it at least for getMapping,
- * but can reuse other parts.
+ *
+ * Specific engines should extend this class and, at the very least,
+ * override the getMapping method, but can reuse other parts.
+ *
  * @since 1.28
  */
 abstract class SearchIndexFieldDefinition implements SearchIndexField {
@@ -115,4 +117,12 @@ abstract class SearchIndexFieldDefinition implements SearchIndexField {
                $this->subfields = $subfields;
                return $this;
        }
+
+       /**
+        * Get the mapping for this field.
+        *
+        * Declared abstract so that every concrete subclass has to provide
+        * its own implementation.
+        *
+        * @param SearchEngine $engine
+        *
+        * @return array
+        */
+       abstract public function getMapping( SearchEngine $engine );
+
 }
index 6168182..39948ca 100644 (file)
@@ -414,6 +414,32 @@ class ContentHandlerTest extends MediaWikiTestCase {
                $this->assertInstanceOf( $handlerClass, $handler );
        }
 
+       /**
+        * The wikitext content handler must report field definitions for
+        * category, external_link, outgoing_link and template.
+        */
+       public function testGetFieldsForSearchIndex() {
+               $searchEngine = $this->newSearchEngine();
+
+               $handler = ContentHandler::getForModelID( CONTENT_MODEL_WIKITEXT );
+
+               $fields = $handler->getFieldsForSearchIndex( $searchEngine );
+
+               $this->assertArrayHasKey( 'category', $fields );
+               $this->assertArrayHasKey( 'external_link', $fields );
+               $this->assertArrayHasKey( 'outgoing_link', $fields );
+               $this->assertArrayHasKey( 'template', $fields );
+       }
+
+       /**
+        * Build a SearchEngine mock whose makeSearchFieldMapping() returns a
+        * DummySearchIndexFieldDefinition for the requested name and type.
+        *
+        * @return SearchEngine Mock object.
+        */
+       private function newSearchEngine() {
+               $searchEngine = $this->getMockBuilder( 'SearchEngine' )
+                       ->getMock();
+
+               $searchEngine->expects( $this->any() )
+                       ->method( 'makeSearchFieldMapping' )
+                       ->will( $this->returnCallback( function( $name, $type ) {
+                                       return new DummySearchIndexFieldDefinition( $name, $type );
+                       } ) );
+
+               return $searchEngine;
+       }
+
        /**
         * @covers ContentHandler::getDataForSearchIndex
         */
@@ -424,7 +450,7 @@ class ContentHandlerTest extends MediaWikiTestCase {
 
                $this->setTemporaryHook( 'SearchDataForIndex',
                        function ( &$fields, ContentHandler $handler, WikiPage $page, ParserOutput $output,
-                                  SearchEngine $engine ) {
+                                          SearchEngine $engine ) {
                                $fields['testDataField'] = 'test content';
                        } );
 
index 6d83057..4301fb8 100644 (file)
@@ -25,61 +25,6 @@ class WikitextStructureTest extends MediaWikiLangTestCase {
                return new WikiTextStructure( $this->getParserOutput( $text ) );
        }
 
-       public function testCategories() {
-               $text = <<<END
-We also have a {{Template}} and an {{Another template}} in addition. 
-This text also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
-And [[Category:Some Category| this category]] is repeated.
-END;
-               $struct = $this->getStructure( $text );
-               $cats = $struct->categories();
-               $this->assertCount( 2, $cats );
-               $this->assertContains( "Some Category", $cats );
-               $this->assertContains( "Yet another category", $cats );
-       }
-
-       public function testOutgoingLinks() {
-               $text = <<<END
-Here I add link to [[Some Page]]. And [[Some Page|This same page]] gets linked twice. 
-We also have [[File:Image.jpg|image]].
-We also have a {{Template}} and an {{Another template}} in addition. 
-Some templates are {{lowercase}}.
-And [[Some_Page]] is linked again. 
-It also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
-Also link to a [[Talk:TestTitle|talk page]] is here. 
-END;
-               $struct = $this->getStructure( $text );
-               $links = $struct->outgoingLinks();
-               $this->assertContains( "Some_Page", $links );
-               $this->assertContains( "Template:Template", $links );
-               $this->assertContains( "Template:Another_template", $links );
-               $this->assertContains( "Template:Lowercase", $links );
-               $this->assertContains( "Talk:TestTitle", $links );
-               $this->assertCount( 5, $links );
-       }
-
-       public function testTemplates() {
-               $text = <<<END
-We have a {{Template}} and an {{Another template}} in addition. 
-Some templates are {{lowercase}}. And this {{Template}} is repeated. 
-Here is {{another_template|with=argument}}.
-This is a template that {{Xdoes not exist}}.
-END;
-               $this->setTemporaryHook( 'TitleExists', function ( Title $title, &$exists ) {
-                       $txt = $title->getBaseText();
-                       if ( $txt[0] != 'X' ) {
-                               $exists = true;
-                       }
-                       return true;
-               } );
-               $struct = $this->getStructure( $text );
-               $templates = $struct->templates();
-               $this->assertCount( 3, $templates );
-               $this->assertContains( "Template:Template", $templates );
-               $this->assertContains( "Template:Another template", $templates );
-               $this->assertContains( "Template:Lowercase", $templates );
-       }
-
        public function testHeadings() {
                $text = <<<END
 Some text here
diff --git a/tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php b/tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php
new file mode 100644 (file)
index 0000000..69d0b76
--- /dev/null
@@ -0,0 +1,70 @@
+<?php
+
+use MediaWiki\Search\ParserOutputSearchDataExtractor;
+
+/**
+ * @group Search
+ * @covers MediaWiki\Search\ParserOutputSearchDataExtractor
+ */
+class ParserOutputSearchDataExtractorTest extends MediaWikiLangTestCase {
+
+       /**
+        * Category keys written with underscores come back as text with spaces.
+        */
+       public function testGetCategories() {
+               $categories = [
+                       'Foo_bar' => 'Bar',
+                       'New_page' => ''
+               ];
+
+               $parserOutput = new ParserOutput( '', [], $categories );
+
+               $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+               $this->assertEquals(
+                       [ 'Foo bar', 'New page' ],
+                       $searchDataExtractor->getCategories( $parserOutput )
+               );
+       }
+
+       /**
+        * URLs registered with addExternalLink() come back as a plain list.
+        */
+       public function testGetExternalLinks() {
+               $parserOutput = new ParserOutput();
+
+               $parserOutput->addExternalLink( 'https://foo' );
+               $parserOutput->addExternalLink( 'https://bar' );
+
+               $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+               $this->assertEquals(
+                       [ 'https://foo', 'https://bar' ],
+                       $searchDataExtractor->getExternalLinks( $parserOutput )
+               );
+       }
+
+       /**
+        * Outgoing links are reported as prefixed DB keys: the underscore is
+        * kept and the namespace prefix is added where there is one.
+        */
+       public function testGetOutgoingLinks() {
+               $parserOutput = new ParserOutput();
+
+               $parserOutput->addLink( Title::makeTitle( NS_MAIN, 'Foo_bar' ), 1 );
+               $parserOutput->addLink( Title::makeTitle( NS_HELP, 'Contents' ), 2 );
+
+               $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+               // this indexes links with db key
+               $this->assertEquals(
+                       [ 'Foo_bar', 'Help:Contents' ],
+                       $searchDataExtractor->getOutgoingLinks( $parserOutput )
+               );
+       }
+
+       /**
+        * Templates are reported as prefixed text: the underscore of the DB key
+        * becomes a space.
+        */
+       public function testGetTemplates() {
+               $title = Title::makeTitle( NS_TEMPLATE, 'Cite_news' );
+
+               $parserOutput = new ParserOutput();
+               $parserOutput->addTemplate( $title, 10, 100 );
+
+               $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+               $this->assertEquals(
+                       [ 'Template:Cite news' ],
+                       $searchDataExtractor->getTemplates( $parserOutput )
+               );
+       }
+
+}