User:GreenC/software/search wikipedia

Method to accurately search Wikipedia[edit]

Find all articles which contain the string "sportsillustrated.cnn.com" AND a {{dead}} template AND < whatever > .. Solving complicated Wikipedia searches like this is trivial: download the Wikipedia database (dumps.wikimedia.org) and search it using whatever tool you prefer. Here are two plug-and-play solutions.

Awk[edit]

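Awk is probably the simplest language available, though with a speed trade-off for lack of a real XML parser. Nevertheless, no additional software is required (awk is a POSIX tool; note that the script below relies on GNU awk extensions such as gensub() and the three-argument match()).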

To run: awk -f search-wp.awk > out

#!/bin/awk -f
# Search entire Wikipedia database.
# Download: https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia
#
# Prints the title of every page whose body matches MySearch.
# Uses GNU awk features: a regular expression as RS and the
# three-argument form of match().
#

BEGIN {

  # Pattern to look for in each article body, and the dump file to scan.
  MySearch = "archive.org/w?e?b?/?[0-9]{1,14}/"
  WPdump = "/f/t/wikipedia-dump/enwiki-20150515-pages-articles.xml"

  # Split the dump into one record per <page> ... </page> element.
  RS=("<page|</page>")

  while ((getline rawstr < WPdump ) > 0) {

    # Skip blank content (the whitespace between </page> and the next <page)
    if (rawstr ~ /^[[:space:]]*$/)
      continue

    # Start every record clean. Without this, a record that has no <title>
    # or <text> element (e.g. the trailing </mediawiki> fragment) would be
    # tested against the previous article's body and could print its title
    # a second time.
    title = ""
    body = ""

    # Convert XML formatting
    # NOTE(review): these patterns target doubly-escaped entities
    # (e.g. "&amp;lt;"); confirm whether singly-escaped "&lt;" is intended.
    gsub(/&amp;lt;/,"<",rawstr);gsub(/&amp;gt;/,">",rawstr);gsub(/&amp;quot;/,"\"",rawstr);gsub(/&amp;amp;/,"\\&",rawstr)

    # Get article title
    if ( match(rawstr, "<title>.+</title>", a) ) {
      split(a[0], b, "(<title>|</title>)")
      title = b[2]
    }

    # Get article body
    if ( match(rawstr, "<text xml:space=\"preserve\">.+</text>", a) ) {
      split(a[0], b, "(<text xml:space=\"preserve\">|</text>)")
      body = b[2]
    }

# ---------- Search -----

    if ( match(body, MySearch, matched_text) ) {
      print title 
      # print matched_text[0]    # uncomment to print 
    }
  }
  # Close the dump file itself (the original closed an undefined name).
  close(WPdump)
}

Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.

Nim[edit]

For a faster solution, here is a Nim example. Nim compiles to optimized C code, which is then compiled by gcc into an executable binary. In a test between Awk and Nim, Awk took 3m31s to complete a search; the same search in Nim took 0m43s. The code below is pretty much copy-paste, compile and run: just set your search pattern, either a Perl-compatible regex or plain text. Example regex strings:

mySearchRe = re"djvu[.]txt"

mySearchRe = re"http[:][^ ]*[^ ]"

(the regex string is wrapped by re"" )

Then download the Nim compiler (the choosenim method is easiest), and compile the source with nim c -d:release --opt:speed -d:danger --passC:"-flto" --passL:"-flto" search.nim.

#
# Search wikipedia dump for a string and print the article title (or matched text) if located
#  Credit: Copyright User:Green_Cardamom, April 2016, MIT License 
#  Language: Nim
#  Additional code credits: Rob Speer (https://github.com/rspeer/wiki2text)
#

import re, options, strutils, os, streams, parsexml

var
  # ---- Configuration: edit these before compiling ----
  # Pattern looked for in every article body.
  mySearchRe: Regex = re"djvu[.]txt"
  # Path of the uncompressed pages-articles XML dump.
  wpDump: string = "/mnt/WindowsFdriveTdir/wikipedia-dump/enwiki-20150901-pages-articles.xml"
  # Article limit for speed testing; 0 means search everything.
  maxCount: int = 0

  # ---- Running totals ----
  # Every article handed to the search.
  countAllArticle: int = 0
  # Articles whose text contains at least one match.
  countArticle: int = 0
  # Total number of pattern matches across all articles.
  countHits: int = 0

type
  # The per-page values collected while scanning the dump.
  TagType = enum
    TITLE     # content of the <title> element
    TEXT      # content of the <text> element
    REDIRECT  # attribute value taken from the <redirect> element
    NS        # content of the <ns> element

  # One string slot per TagType, indexed directly by the enum value.
  ArticleData = array[TagType, string]

#
# Search text
#
proc searchText(article: ArticleData): bool {.discardable.} =
  ## Count the matches of `mySearchRe` in `article[TEXT]` and update the
  ## global counters `countAllArticle`, `countArticle` and `countHits`.
  ##
  ## When at least one match is found, the article title is echoed and
  ## `true` is returned; otherwise the result is `false`.
  ##
  ## If `maxCount` is greater than zero and that many articles have been
  ## processed, a summary is echoed and the program quits.
  var
    artcount = 0  # matches found in this article
    start = 0     # offset at which the next search begins

  inc countAllArticle

  while start <= article[TEXT].len:
    let (first, last) = findBounds(article[TEXT], mySearchRe, start)
    if first < 0: break
    inc artcount
    # Resume after the END of this match so that overlapping/suffix matches
    # are not counted again (e.g. "[0-9]+" on "12345" is one hit, not five).
    # An empty match reports last == first - 1, so always advance at least
    # one character past `first` to guarantee progress.
    start = max(first, last) + 1

  if artcount > 0:
    inc countArticle      # number of article titles matching
    countHits += artcount # number of matches of search pattern
    echo article[TITLE]
    result = true

  # Optional early stop, used for speed testing.
  if maxCount > 0 and countAllArticle >= maxCount:
    echo ""
    echo "Articles all: ", countAllArticle
    echo "Articles with a match: ", countArticle
    echo "Number of pattern matches: ", countHits
    quit()

#
# Main: stream the XML dump, collect the title/text/ns/redirect values of
# each <page>, and hand every completed page to searchText().
#
const
  # Elements whose character data is captured into the article record.
  RELEVANT_XML_TAGS = ["title", "text", "ns"]

var
  textBuffer = ""                     # scratch buffer for the value being read
  s = newFileStream(wpDump, fmRead)   # nil when the file cannot be opened
  gettingText = false                 # true while inside a relevant element
  gettingAttribute = false            # true while inside a <redirect> tag
  article: ArticleData
  xml: XmlParser

if s == nil: quit("cannot open the file " & wpDump)
for tag in TITLE..NS: article[tag] = ""
xml.open(s, wpDump, options={reportWhitespace})

while true:
  # Scan through the XML, handling each token as it arrives.
  xml.next()
  case xml.kind
  of xmlElementStart, xmlElementOpen:
    if RELEVANT_XML_TAGS.contains(xml.elementName):
      # If this is a "title", "text", or "ns" tag, prepare to get its
      # text content. Empty the text buffer so the new value replaces
      # whatever was there.
      textBuffer.setLen(0)
      gettingText = true
    elif xml.elementName == "page":
      # If this is a new instance of the <page> tag that contains all
      # these tags, then reset the value that won't necessarily be
      # overridden, which is the redirect value.
      article[REDIRECT].setLen(0)
    elif xml.elementName == "redirect":
      # If this is the start of a redirect tag, prepare to get its
      # attribute value. The buffer still holds the string swapped out
      # by the previous element end, so it must be emptied first or the
      # redirect value would be appended to that stale content.
      textBuffer.setLen(0)
      gettingAttribute = true
  of xmlAttribute:
    # If we're looking for an attribute value, and we found one, add it
    # to the buffer.
    if gettingAttribute:
      textBuffer.add(xml.attrValue)
  of xmlCharData, xmlWhitespace:
    # If we're looking for text, and we found it, add it to the buffer.
    if gettingText:
      textBuffer.add(xml.charData)
  of xmlElementEnd:
    # When we reach the end of an element we care about, take the text
    # we've found and store it in the 'article' data structure by
    # swapping it with the buffer.
    case xml.elementName
    of "title":
      swap article[TITLE], textBuffer
    of "text":
      swap article[TEXT], textBuffer
    of "redirect":
      swap article[REDIRECT], textBuffer
    of "ns":
      swap article[NS], textBuffer
    of "page":
      # When we reach the end of the <page> tag, send the article
      # data to searchText().
      searchText(article)
    else:
      discard

    # Now that we've reached the end of an element, stop extracting
    # text. (We'll never need to extract text from elements that can
    # have other XML elements nested inside them.)
    gettingText = false
    gettingAttribute = false

  of xmlEof:
    break

  else:
    discard
xml.close()

# Final summary of the whole run.
echo "Search Wikipedia completed" 
echo "----" 
echo "Articles all: ", countAllArticle
echo "Articles with a match: ", countArticle
echo "Number of pattern matches: ", countHits

Note: when redirecting large output, send to a different disk (ramdisk or other physical volume) otherwise it could slow reading the XML file.