ResourceType "MediaWikiPage"

From dataspects::Wiki
C1544346692
Jump to navigation Jump to search



Feeding

Source

https://github.com/dataspects/DataspectsMediaWikiFeeder

Entities (Namespace 0, 10, 106, 102)

Destination

$wgDataspectsApiURL/mediawikis/$wgMediaWikiMongoID/pages

NativeMongoDoc

    $mongoDoc = array(
      "slug" => "pending",
      "resourceSiloType" => "pending",
      "resourceSiloLabel" => "pending",
      "resourceSiloID" => "pending",
      "resourceType" => "MediaWikiPage",
      "pagename" => $this->title->mTextform,
      // Do we want the index.php?title= form here?
      "rawUrl" => $this->title->getInternalURL(),
      "shortUrl" => $this->title->getFullURL(),
      "namespace" => $this->getNamespace($this->title->mNamespace),
      "full" => array(
        "wikitext" => $this->wikitext,
        "text" => "NOT USED BECAUSE NO TIKA HERE",
        "html" => $this->parsedWikitext
      ),
      "categories" => $this->categories,
      "annotations" => $this->annotations,
      "feederClass" =>"DataspectsMediaWikiFeeder"
    );

              $this->categories[] = $category->mTextform;

              # Only Namespace 0
              $this->annotations[] = array(
                'subject' => $this->title->mTextform,
                'predicate' => $propertyName,
                'object' => array(
                  'source' => str_replace('#0##', '', $object['item']),
                  'html' => '',
                  'text' => ''
                )
              );

Predicates (Namespace 102)

Destination

$wgDataspectsApiURL/predicates

NativeMongoDoc

    $predicateMongodoc = array(
      "predicate" => $this->title->mTextform,
      "predicateType" => $this->annotations['_TYPE']['object'],
      "predicateClass" => $this->annotations['HasPredicateClass']['object'],
      "predicateNamespace" => $this->getNamespace($this->title->mNamespace),
      "predicateCategories" => $this->categories
    );

    $this->categories[] = $category->mTextform;

Indexing

Entities

Destination

process.env.ES_NODE
routing: document.id
index: process.env.DEFAULT_ES_INDEX

NativeESDoc

      let entityDoc = {
        MyType: 'entity',
        // Resource silo level
        OriginatedFromResourceSiloID: document.resourceSiloID,
        OriginatedFromResourceSiloLabel: document.resourceSiloLabel,
        OriginatedFromResourceSiloType: document.resourceSiloType,
        // Resource level
        OriginatedFromResourceName: document.pagename,
        OriginatedFromResourceURL: document.rawUrl,
        OriginatedFromResourceType: document.resourceType,
        // Entity/subject level
        HasEntityClass: "",
        HasEntityName: document.pagename,
        HasEntityType: theHasEntityType,
        HasEntityURL: document.rawUrl,
        HasEntityTitle: hasEntityTitle,
        HasEntityBlurbTEXT: hasEntityBlurb,
        HasEntityBlurbHTML: "",
        HasEntityContentSOURCE: document.full.wikitext,
        HasEntityContentTEXT: entityContentText,
        HasEntityContentHTML: html
      }

theHasEntityType

      const termOntEntityTypes = ["Template", "Form"];
      var theHasEntityType = hasEntityType;
      if (termOntEntityTypes.includes(document.namespace)) theHasEntityType = document.namespace;

DirectESField HasEntityTitle

is populated by the corresponding mediawikipage's NativeResourceAnnotation "HasEntityTitle".

      var hasEntityTitle = "";
      if (theAnnotation.predicate == "HasEntityTitle") hasEntityTitle = theAnnotation.object.source;

DirectESField HasEntityBlurbTEXT

is populated by the corresponding mediawikipage's NativeResourceAnnotation "HasEntityBlurb".

      var hasEntityBlurb = "";
      if (theAnnotation.predicate == "HasEntityBlurb") hasEntityBlurb = theAnnotation.object.source;

DirectESField HasEntityContentTEXT

is populated by entityContentText from

      $ = cheerio.load(document.full.html);
      $('#dsMETADATA').remove();
      html = $("body").html();
      text = response.data[0]['X-TIKA:content']; // from html
      entityContentText = text.replace(/\r?\n|\r/g, ' ').replace(/\s\s+/g, ' ');

DirectESField HasEntityContentHTML

is populated by html from

      $ = cheerio.load(document.full.html);
      $('#dsMETADATA').remove();
      html = $("body").html();

Annotations

Destination

process.env.ES_NODE
routing: theAnnotation.mediaWikiPage.toString()
index: process.env.DEFAULT_ES_INDEX

NativeResourceAnnotations

A mediawikipage's NativeResourceAnnotations are looked up by Annotation.find({ mediaWikiPage: job.data.pageId }).

NativeESDoc

          let annotationDoc = {
            MyType: {
              name: 'annotation',
              parent: theAnnotation.mediaWikiPage
            },
            subject: theAnnotation.subject,
            predicate: theAnnotation.predicate,
            objectSOURCE: theAnnotation.object.source,
            objectHTML: theAnnotation.object.html,
            objectTEXT: theAnnotation.object.text
          };