#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2012 Lars Buitinck <larsmans@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump.
If you have the `pattern` package installed, this module will use a fancy
lemmatization to get a lemma of each token (instead of plain alphabetic
tokenizer). The package is available at https://github.com/clips/pattern .
See scripts/process_wiki.py for a canned (example) script based on this
module.
"""
import bz2
import logging
import re
from xml.etree.cElementTree import iterparse # LXML isn't faster, so let's go with the built-in solution
import multiprocessing
from gensim import utils
# cannot import whole gensim.corpora, because that imports wikicorpus...
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.textcorpus import TextCorpus
logger = logging.getLogger('gensim.corpora.wikicorpus')
# ignore articles shorter than ARTICLE_MIN_WORDS characters (after full preprocessing)
ARTICLE_MIN_WORDS = 50
RE_P0 = re.compile('<!--.*?-->', re.DOTALL | re.UNICODE) # comments
RE_P1 = re.compile('<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE) # footnotes
RE_P2 = re.compile("(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE) # links to languages
RE_P3 = re.compile("{{([^}{]*)}}", re.DOTALL | re.UNICODE) # template
RE_P4 = re.compile("{{([^}]*)}}", re.DOTALL | re.UNICODE) # template
RE_P5 = re.compile('\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description
RE_P6 = re.compile("\[([^][]*)\|([^][]*)\]", re.DOTALL | re.UNICODE) # simplify links, keep description
RE_P7 = re.compile('\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images
RE_P8 = re.compile('\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files
RE_P9 = re.compile('<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE) # outside links
RE_P10 = re.compile('<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE) # math content
RE_P11 = re.compile('<(.*?)>', re.DOTALL | re.UNICODE) # all other tags
RE_P12 = re.compile('\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
RE_P13 = re.compile('\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
RE_P14 = re.compile('\[\[Category:[^][]*\]\]', re.UNICODE) # categories
# Remove File and Image template
RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
# ought to be ignored
IGNORED_NAMESPACES = ['Wikipedia', 'Category', 'File', 'Portal', 'Template',
'MediaWiki', 'User', 'Help', 'Book', 'Draft',
'WikiProject', 'Special', 'Talk']
def filter_wiki(raw):
"""
Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
or utf-8 encoded string.
"""
# parsing of the wiki markup is not perfect, but sufficient for our purposes
# contributions to improving this code are welcome :)
text = utils.to_unicode(raw, 'utf8', errors='ignore')
text = utils.decode_htmlentities(text) # '&nbsp;' --> '\xa0'
return remove_markup(text)
def remove_markup(text):
text = re.sub(RE_P2, "", text) # remove the last list (=languages)
# the wiki markup is recursive (markup inside markup etc)
# instead of writing a recursive grammar, here we deal with that by removing
# markup in a loop, starting with inner-most expressions and working outwards,
# for as long as something changes.
text = remove_template(text)
text = remove_file(text)
iters = 0
while True:
old, iters = text, iters + 1
text = re.sub(RE_P0, "", text) # remove comments
text = re.sub(RE_P1, '', text) # remove footnotes
text = re.sub(RE_P9, "", text) # remove outside links
text = re.sub(RE_P10, "", text) # remove math content
text = re.sub(RE_P11, "", text) # remove all remaining tags
text = re.sub(RE_P14, '', text) # remove categories
text = re.sub(RE_P5, '\\3', text) # remove urls, keep description
text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only
# remove table markup
text = text.replace('||', '\n|') # each table cell on a separate line
text = re.sub(RE_P12, '\n', text) # remove formatting lines
text = re.sub(RE_P13, '\n\\3', text) # leave only cell content
# remove empty mark-up
text = text.replace('[]', '')
if old == text or iters > 2: # stop if nothing changed between two iterations or after a fixed number of iterations
break
# the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
# TODO is this really desirable?
text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text
return text
def remove_template(s):
"""Remove template wikimedia markup.
Return a copy of `s` with all the wikimedia markup template removed. See
http://meta.wikimedia.org/wiki/Help:Template for wikimedia templates
details.
Note: Since template can be nested, it is difficult remove them using
regular expresssions.
"""
# Find the start and end position of each template by finding the opening
# '{{' and closing '}}'
n_open, n_close = 0, 0
starts, ends = [], []
in_template = False
prev_c = None
for i, c in enumerate(iter(s)):
if not in_template:
if c == '{' and c == prev_c:
starts.append(i - 1)
in_template = True
n_open = 1
if in_template:
if c == '{':
n_open += 1
elif c == '}':
n_close += 1
if n_open == n_close:
ends.append(i)
in_template = False
n_open, n_close = 0, 0
prev_c = c
# Remove all the templates
s = ''.join([s[end + 1:start] for start, end in
zip(starts + [None], [-1] + ends)])
return s
def remove_file(s):
"""Remove the 'File:' and 'Image:' markup, keeping the file caption.
Return a copy of `s` with all the 'File:' and 'Image:' markup replaced by
their corresponding captions. See http://www.mediawiki.org/wiki/Help:Images
for the markup details.
"""
# The regex RE_P15 match a File: or Image: markup
for match in re.finditer(RE_P15, s):
m = match.group(0)
caption = m[:-2].split('|')[-1]
s = s.replace(m, caption, 1)
return s
def tokenize(content):
"""
Tokenize a piece of text from wikipedia. The input string `content` is assumed
to be mark-up free (see `filter_wiki()`).
Return list of tokens as utf8 bytestrings. Ignore words shorted than 2 or longer
that 15 characters (not bytes!).
"""
# TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
return [token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore')
if 2 <= len(token) <= 15 and not token.startswith('_')]
def get_namespace(tag):
"""Returns the namespace of tag."""
m = re.match("^{(.*?)}", tag)
namespace = m.group(1) if m else ""
if not namespace.startswith("http://www.mediawiki.org/xml/export-"):
raise ValueError("%s not recognized as MediaWiki dump namespace"
% namespace)
return namespace
_get_namespace = get_namespace
def extract_pages(f, filter_namespaces=False):
"""
Extract pages from a MediaWiki database dump = open file-like object `f`.
Return an iterable over (str, str, str) which generates (title, content, pageid) triplets.
"""
elems = (elem for _, elem in iterparse(f, events=("end",)))
# We can't rely on the namespace for database dumps, since it's changed
# it every time a small modification to the format is made. So, determine
# those from the first element we find, which will be part of the metadata,
# and construct element paths.
elem = next(elems)
namespace = get_namespace(elem.tag)
ns_mapping = {"ns": namespace}
page_tag = "{%(ns)s}page" % ns_mapping
text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
title_path = "./{%(ns)s}title" % ns_mapping
ns_path = "./{%(ns)s}ns" % ns_mapping
pageid_path = "./{%(ns)s}id" % ns_mapping
for elem in elems:
if elem.tag == page_tag:
title = elem.find(title_path).text
text = elem.find(text_path).text
ns = elem.find(ns_path).text
if filter_namespaces and ns not in filter_namespaces:
text = None
pageid = elem.find(pageid_path).text
yield title, text or "", pageid # empty page will yield None
# Prune the element tree, as per
# http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
# except that we don't need to prune backlinks from the parent
# because we don't use LXML.
# We do this only for <page>s, since we need to inspect the
# ./revision/text element. The pages comprise the bulk of the
# file, so in practice we prune away enough.
elem.clear()
_extract_pages = extract_pages # for backward compatibility
def process_article(args):
"""
Parse a wikipedia article, returning its content as a list of tokens
(utf8-encoded strings).
"""
text, lemmatize, title, pageid = args
text = filter_wiki(text)
if lemmatize:
result = utils.lemmatize(text)
else:
result = tokenize(text)
return result, title, pageid
[docs]class WikiCorpus(TextCorpus):
"""
Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus.
The documents are extracted on-the-fly, so that the whole (massive) dump
can stay compressed on disk.
>>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
>>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word
"""
[docs] def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
"""
Initialize the corpus. Unless a dictionary is provided, this scans the
corpus once, to determine its vocabulary.
If `pattern` package is installed, use fancier shallow parsing to get
token lemmas. Otherwise, use simple regexp tokenization. You can override
this automatic logic by forcing the `lemmatize` parameter explicitly.
"""
self.fname = fname
self.filter_namespaces = filter_namespaces
self.metadata = False
if processes is None:
processes = max(1, multiprocessing.cpu_count() - 1)
self.processes = processes
self.lemmatize = lemmatize
if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
else:
self.dictionary = dictionary
[docs] def get_texts(self):
"""
Iterate over the dump, returning text version of each article as a list
of tokens.
Only articles of sufficient length are returned (short articles & redirects
etc are ignored).
Note that this iterates over the **texts**; if you want vectors, just use
the standard corpus interface instead of this function::
>>> for vec in wiki_corpus:
>>> print(vec)
"""
articles, articles_all = 0, 0
positions, positions_all = 0, 0
texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
pool = multiprocessing.Pool(self.processes)
# process the corpus in smaller chunks of docs, because multiprocessing.Pool
# is dumb and would load the entire input into RAM at once...
for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10):
articles_all += 1
positions_all += len(tokens)
# article redirects and short stubs are pruned here
if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
continue
articles += 1
positions += len(tokens)
if self.metadata:
yield (tokens, (pageid, title))
else:
yield tokens
pool.terminate()
logger.info(
"finished iterating over Wikipedia corpus of %i documents with %i positions"
" (total %i articles, %i positions before pruning articles shorter than %i words)",
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
self.length = articles # cache corpus length
# endclass WikiCorpus