Add more data to categories dump
authorStanislav Malyshev <smalyshev@gmail.com>
Fri, 17 Nov 2017 23:58:37 +0000 (15:58 -0800)
committerSmalyshev <smalyshev@wikimedia.org>
Wed, 17 Jan 2018 00:40:16 +0000 (00:40 +0000)
Hidden category: <cat> a mediawiki:HiddenCategory .
Pages count: <cat> mediawiki:pages 10 .
Subcats count: <cat> mediawiki:subcategories 5 .

Note that the pages count includes only actual articles, unlike the cat_pages
column of the MediaWiki category table, which also counts subcategories and files.

Bug: T173980
Change-Id: I6d34c58f844411f891195776406e11acd2aef7b1

docs/ontology.owl
includes/CategoriesRdf.php
maintenance/dumpCategoriesAsRdf.php
tests/phpunit/data/categoriesrdf/categoriesRdf-out.nt
tests/phpunit/maintenance/categoriesRdfTest.php

index 6b2e0b7..19476a3 100644 (file)
     <rdfs:comment>MediaWiki category.</rdfs:comment>
   </owl:Class>
 
+  <owl:Class rdf:about="&mediawiki;HiddenCategory">
+    <rdfs:label>HiddenCategory</rdfs:label>
+    <rdfs:comment>MediaWiki hidden category.</rdfs:comment>
+  </owl:Class>
+
   <!--
   ///////////////////////////////////////////////////////////////////////////////////////
   //
       <rdfs:domain rdf:resource="&mediawiki;Category"/>
   </owl:ObjectProperty>
 
+  <owl:DatatypeProperty rdf:about="&mediawiki;pages">
+      <rdfs:label>pages</rdfs:label>
+      <rdfs:comment>Number of articles belonging to this category.</rdfs:comment>
+      <rdfs:domain rdf:resource="&mediawiki;Category"/>
+      <rdfs:range rdf:resource="&xsd;integer"/>
+  </owl:DatatypeProperty>
+
+  <owl:DatatypeProperty rdf:about="&mediawiki;subcategories">
+      <rdfs:label>subcategories</rdfs:label>
+      <rdfs:comment>Number of subcategories belonging to this category.</rdfs:comment>
+      <rdfs:domain rdf:resource="&mediawiki;Category"/>
+      <rdfs:range rdf:resource="&xsd;integer"/>
+  </owl:DatatypeProperty>
+
 </rdf:RDF>
index 463f6e8..fc296d4 100644 (file)
@@ -80,14 +80,29 @@ class CategoriesRdf {
        /**
         * Write out the data for single category.
         * @param string $categoryName Category name
+        * @param bool $isHidden Hidden category?
+        * @param int $pages Page count (note this includes only Wiki articles, not subcats or files)
+        *  Negative values are written as 0.
+        * @param int $subcategories Subcategory count
         */
-       public function writeCategoryData( $categoryName ) {
+       public function writeCategoryData( $categoryName, $isHidden, $pages, $subcategories ) {
+               if ( $pages < 0 ) {
+                       // A count obtained by subtracting stored counters from each other can come
+                       // out negative when those counters disagree; never emit a negative count.
+                       $pages = 0;
+               }
                $title = Title::makeTitle( NS_CATEGORY, $categoryName );
                $this->rdfWriter->about( $this->titleToUrl( $title ) )
                        ->say( 'a' )
                        ->is( self::ONTOLOGY_PREFIX, 'Category' );
+               if ( $isHidden ) {
+                       $this->rdfWriter->is( self::ONTOLOGY_PREFIX, 'HiddenCategory' );
+               }
                $titletext = $title->getText();
                $this->rdfWriter->say( 'rdfs', 'label' )->value( $titletext );
+               $this->rdfWriter->say( self::ONTOLOGY_PREFIX, 'pages' )->value( $pages );
+               $this->rdfWriter->say( self::ONTOLOGY_PREFIX, 'subcategories' )->value( $subcategories );
+               // TODO: do we want files too here? Easy to add, but don't have use case so far.
        }
 
        /**
index c1835d0..3467932 100644 (file)
@@ -58,14 +58,32 @@ class DumpCategoriesAsRdf extends Maintenance {
        public function getCategoryIterator( IDatabase $dbr ) {
                $it = new BatchRowIterator(
                        $dbr,
-                       'page',
+                       [ 'page', 'page_props', 'category' ],
                        [ 'page_title' ],
                        $this->getBatchSize()
                );
                $it->addConditions( [
                        'page_namespace' => NS_CATEGORY,
                ] );
-               $it->setFetchColumns( [ 'page_title', 'page_id' ] );
+               $it->setFetchColumns( [
+                       'page_title',
+                       'page_id',
+                       'pp_propname',
+                       'cat_pages',
+                       'cat_subcats',
+                       'cat_files'
+               ] );
+               $it->addJoinConditions(
+                       [
+                               'page_props' => [ // joins only the page's 'hiddencat' property row, if any
+                                       'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
+                               ],
+                               'category' => [ // LEFT JOIN keeps pages that have no category-table row
+                                       'LEFT JOIN', [ 'cat_title = page_title' ]
+                               ]
+                       ]
+
+               );
                return $it;
        }
 
@@ -90,6 +108,9 @@ class DumpCategoriesAsRdf extends Maintenance {
                return new RecursiveIteratorIterator( $it );
        }
 
+       /**
+        * @param int $timestamp
+        */
        public function addDumpHeader( $timestamp ) {
                global $wgRightsUrl;
                $licenseUrl = $wgRightsUrl;
@@ -129,7 +150,12 @@ class DumpCategoriesAsRdf extends Maintenance {
                foreach ( $this->getCategoryIterator( $dbr ) as $batch ) {
                        $pages = [];
                        foreach ( $batch as $row ) {
-                               $this->categoriesRdf->writeCategoryData( $row->page_title );
+                               $this->categoriesRdf->writeCategoryData(
+                                       $row->page_title,
+                                       $row->pp_propname === 'hiddencat',
+                                       (int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
+                                       (int)$row->cat_subcats
+                               );
                                $pages[$row->page_id] = $row->page_title;
                        }
 
index b8bd8e0..bbb3787 100644 (file)
@@ -7,10 +7,17 @@
 <http://acme.test/wiki/Special:CategoryDump> <http://www.w3.org/2002/07/owl#imports> <https://www.mediawiki.org/ontology/ontology.owl> .
 <http://acme.test/wiki/Category:Category_One> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#Category> .
 <http://acme.test/wiki/Category:Category_One> <http://www.w3.org/2000/01/rdf-schema#label> "Category One" .
+<http://acme.test/wiki/Category:Category_One> <https://www.mediawiki.org/ontology#pages> "7"^^<http://www.w3.org/2001/XMLSchema#integer> .
+<http://acme.test/wiki/Category:Category_One> <https://www.mediawiki.org/ontology#subcategories> "10"^^<http://www.w3.org/2001/XMLSchema#integer> .
 <http://acme.test/wiki/Category:2_Category_Two> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#Category> .
+<http://acme.test/wiki/Category:2_Category_Two> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#HiddenCategory> .
 <http://acme.test/wiki/Category:2_Category_Two> <http://www.w3.org/2000/01/rdf-schema#label> "2 Category Two" .
+<http://acme.test/wiki/Category:2_Category_Two> <https://www.mediawiki.org/ontology#pages> "17"^^<http://www.w3.org/2001/XMLSchema#integer> .
+<http://acme.test/wiki/Category:2_Category_Two> <https://www.mediawiki.org/ontology#subcategories> "0"^^<http://www.w3.org/2001/XMLSchema#integer> .
 <http://acme.test/wiki/Category:Category_One> <https://www.mediawiki.org/ontology#isInCategory> <http://acme.test/wiki/Category:Parent_of_1> .
 <http://acme.test/wiki/Category:2_Category_Two> <https://www.mediawiki.org/ontology#isInCategory> <http://acme.test/wiki/Category:Parent_of_2> .
 <http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#Category> .
 <http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <http://www.w3.org/2000/01/rdf-schema#label> "\u0422\u0440\u0435\u0442\u044C\u044F \u043A\u0430\u0442\u0435\u0433\u043E\u0440\u0438\u044F" .
+<http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <https://www.mediawiki.org/ontology#pages> "0"^^<http://www.w3.org/2001/XMLSchema#integer> .
+<http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <https://www.mediawiki.org/ontology#subcategories> "0"^^<http://www.w3.org/2001/XMLSchema#integer> .
 <http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <https://www.mediawiki.org/ontology#isInCategory> <http://acme.test/wiki/Category:Parent_of_3> .
index 2edbae1..c0850ab 100644 (file)
@@ -9,12 +9,33 @@ class CategoriesRdfTest extends MediaWikiLangTestCase {
                return [
                        // batch 1
                        [
-                               (object)[ 'page_title' => 'Category One', 'page_id' => 1 ],
-                               (object)[ 'page_title' => '2 Category Two', 'page_id' => 2 ],
+                               (object)[
+                                       'page_title' => 'Category One',
+                                       'page_id' => 1,
+                                       'pp_propname' => null,
+                                       'cat_pages' => '20',
+                                       'cat_subcats' => '10',
+                                       'cat_files' => '3'
+                               ],
+                               (object)[
+                                       'page_title' => '2 Category Two',
+                                       'page_id' => 2,
+                                       'pp_propname' => 'hiddencat',
+                                       'cat_pages' => 20,
+                                       'cat_subcats' => 0,
+                                       'cat_files' => 3
+                               ],
                        ],
                        // batch 2
                        [
-                               (object)[ 'page_title' => 'Третья категория', 'page_id' => 3 ],
+                               (object)[
+                                       'page_title' => 'Третья категория',
+                                       'page_id' => 3,
+                                       'pp_propname' => null,
+                                       'cat_pages' => '0',
+                                       'cat_subcats' => '0',
+                                       'cat_files' => '0'
+                               ],
                        ]
                ];
        }