#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# taxoconvert.py -- convert multi-template taxoboxes to single template
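#
# Rewrites pages built from the old {{taxobox begin}} / {{taxobox ... entry}} /
# {{taxobox end}} template family into a single {{Taxobox}} call, optionally
# showing a diff and asking for confirmation before saving each page.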
import codecs
import getopt
import os
import pickle
import re
import sys
import tempfile
import wikipedia
# Runtime flags; set from the command-line options in main().
site = wikipedia.Site('en')
checks = True
edit = False
debug = False
class Error(Exception):
def __init__(self, text):
self.text = text
def __str__(self):
return self.text
class NoError(Error):
    # Raised for pages that can simply be skipped (no real error to report).
    pass
def edittext(s):
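    """Write s to a temporary file, open it in $EDITOR (default vi), and
    return the edited contents."""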
fn = tempfile.mktemp()
f = codecs.open(fn, 'w', 'utf-8')
f.write(s)
f.close()
os.system('%s "%s"' % (os.getenv('EDITOR', 'vi'), fn))
f = codecs.open(fn, 'r', 'utf-8')
s = f.read()
f.close()
return s
def canonize(s):
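    """Reduce a string to its lowercase alphanumeric characters for loose comparison."""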
return filter(lambda c: c.isalnum(), s).lower()
def check(text, newtext):
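    """Show a diff between text and newtext and ask for confirmation; return
    the approved (possibly hand-edited) text, or None to skip the page."""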
if not checks:
return newtext
while 1:
wikipedia.showDiff(text, newtext)
i = wikipedia.input(u'OK? [yNeq]')
if i == 'q':
raise IOError
elif i == 'y':
return newtext
elif i == 'e':
newtext = edittext(newtext)
else:
return None
def record(params, key, value):
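    """Store a taxobox parameter, remembering insertion order in params['sequence']."""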
if debug:
wikipedia.output(u"%s = %s" % (key, value))
if params.has_key(key):
raise Error(u"Duplicate key %s" % key)
if value:
params['sequence'].append(key)
params[key] = value
def parse_nomial(suffix, n, lines, params):
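    """Parse the {{taxobox section binomial/trinomial ...}} templates (and a
    trailing range-map image) starting at lines[n], recording them with the
    given parameter suffix; return (new n, whether anything was consumed)."""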
if debug:
wikipedia.output(u"parse_nomial: suffix = '%s', lines[n] = %s" % (suffix, lines[n]))
orig_n = n
found = False
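    # Try each known variant of {{taxobox section (bi|tri)nomial ...}} in turn.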
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
r'(?:[ _]+(?:simple|botany|parens))? *\| *'
r'color *= *[a-z]+ *\| *'
r'\1_name *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
if m:
record(params, m.group(1) + suffix, "''%s''" % m.group(2))
n += 1
found = True
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
r'(?:[ _]+part)? *\| *'
r'(?:color *= *[a-z]+ *\| *)?'
r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
r'author *= *([^\}]*[^\} ]) *\| *'
r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
if m and not found:
record(params, m.group(1) + suffix, "''%s''" % m.group(2))
record(params, '%s%s_authority' % (m.group(1), suffix),
'%s, %s' % (m.group(3), m.group(4)))
n += 1
found = True
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial) *\| *'
r'color *= *[a-z]+ *\| *'
r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
r'author *= *([^\}]*[^\} ]) *\| *'
r'date *= *}}$', lines[n])
if m and not found:
record(params, m.group(1) + suffix, "''%s''" % m.group(2))
record(params, '%s%s_authority' % (m.group(1), suffix), m.group(3))
n += 1
found = True
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
r'(?:[ _]+(?:parens|botany|simple))? *\| *'
r'color *= *[a-z]+ *\| *'
r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
r'author *= *\| *'
r'date *= *}}$', lines[n])
if m and not found:
record(params, m.group(1) + suffix, "''%s''" % m.group(2))
n += 1
found = True
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
r'[ _]+parens(?:[ _]+part)? *\| *'
r'(?:color *= *[a-z]+ *\| *)?'
r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
r'author *= *([^\}]*[^\} ]) *\| *'
r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
if m and not found:
record(params, m.group(1) + suffix, "''%s''" % m.group(2))
record(params, '%s%s_authority' % (m.group(1), suffix),
'(%s, %s)' % (m.group(3), m.group(4)))
n += 1
found = True
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
r'[ _]+botany *\| *'
r'color *= *[a-z]+ *\| *'
r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
r'author *= *([^\}]*[^\} ]|) *}}$', lines[n])
if m and not found:
record(params, m.group(1) + suffix, "''%s''" % m.group(2))
record(params, '%s%s_authority' % (m.group(1), suffix), m.group(3))
n += 1
found = True
if n + 1 < len(lines):
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
r'[ _]+botany *\| *'
r'color *= *[a-z]+ *\| *'
r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
r'author *= *([^\}]*[^\} ]|) *}}$', lines[n] + lines[n+1])
if m and not found:
record(params, m.group(1) + suffix, "''%s''" % m.group(2))
record(params, '%s%s_authority' % (m.group(1), suffix), m.group(3))
n += 2
found = True
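    # A trailing {{taxobox image}} whose file name mentions "range" or
    # "distribution" becomes the range_map* parameters.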
m = re.match(r'(?i){{taxobox[ _]+image *\| *image *= *'
r'\[\[ *Image: *([^\|\]]*[^\|\] ]) *'
r'(?:\| *([0-9]+px))?(?:\|[^\]]*)?\]\] *\| *'
r'caption *= *([^\}]*[^\} ]|) *}}$', lines[n])
if m and re.search(r'(?i)(?:range|distribution)', lines[n]):
record(params, 'range_map%s' % suffix, m.group(1))
record(params, 'range_map%s_width' % suffix, m.group(2))
record(params, 'range_map%s_caption' % suffix, m.group(3))
n += 1
return (n, orig_n != n)
def parse(text, linkname):
"""parse(text, linkname) -- parse multi-template taxobox from 'text' and
return it as a dictionary suitable for constructing a taxobox
template."""
params = {'sequence': []}
text = re.sub(r'(?m)[ \t\r]+$', '', text)
    if 1 < len(re.findall(r'(?i){{taxobox[ _]+begin *\|', text)):
        raise Error(u"More than one occurrence of {{taxobox begin}}.")
    if 1 < len(re.findall(r'(?i){{taxobox[ _]+end *}}', text)):
        raise Error(u"More than one occurrence of {{taxobox end}}.")
m = re.search(r'(?is){{taxobox[ _]+begin.*{{taxobox[ _]+end *}}', text)
if not m:
global done
done[linkname] = True
raise NoError(u"Can't find taxobox.")
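    # Split the taxobox into one template per entry, treating a <br> that
    # precedes a template or comment as a line break.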
lines = re.split(r'(?: *(?:</?br */?>(?= *(?:{{|<))|\n) *)+', m.group(0))
n = 0
m1 = re.match(r'(?i){{taxobox[ _]+begin *\| *color *= *([a-z]+) *\| *'
'name *= *(.*[^ ]) *}}[ \t]*(?:<br */?> *)?$', lines[n])
m2 = re.match(r'(?i){{taxobox[ _]+begin *\| *name *= *(.*[^ ]) *\| *'
'color *= *([a-z]+) *}}[ \t]*(?:<br */?> *)?$', lines[n])
if m1:
record(params, 'color', m1.group(1))
record(params, 'name', m1.group(2))
n += 1
elif m2:
record(params, 'color', m2.group(2))
record(params, 'name', m2.group(1))
n += 1
else:
raise Error(u"Can't find {{taxobox begin}}: %s" % lines[n])
m = re.match(r'(?i){{(?:template:)?(status[^\}]+)}}', lines[n])
if m:
record(params, 'status', '{{%s}}' % m.group(1))
n += 1
m = re.match(r'(?i)(?:<small> *)?fossil +(?:range|record): +([^<\n]*[^<\n ]) *'
r'(?:</small>)?', lines[n])
if m:
record(params, 'fossil_range', m.group(1))
n += 1
if re.match(r'(?i)<!--.*-->', lines[n]):
n += 1
image_re = (r'(?i){{taxobox[ _]+image *\| *image *= *'
r'\[\[ *Image: *([^\|\]]*[^\|\] ]) *'
r'(?:\| *([0-9]+px))?(?:\|.*)?\]\] *\| *'
r'caption *= *([^\}]*[^\} ]|) *}}$')
m1 = re.match(image_re, lines[n])
m2 = re.match(image_re, lines[n] + lines[n+1])
m3 = re.match(r'(?i){{taxobox[ _]+image *\| *image *= *'
r'\[\[ *Image: *([^\|\]]*[^\|\] ]) *'
r'(?:\| *([0-9]+px))?(?:\|.*)?\]\] *}}$', lines[n])
if m1:
record(params, 'image', m1.group(1))
record(params, 'image_width', m1.group(2))
record(params, 'image_caption', m1.group(3))
n += 1
elif m2:
record(params, 'image', m2.group(1))
record(params, 'image_width', m2.group(2))
record(params, 'image_caption', m2.group(3))
n += 2
elif m3:
record(params, 'image', m3.group(1))
record(params, 'image_width', m3.group(2))
n += 1
m = re.match(image_re, lines[n])
if m:
record(params, 'image2', m.group(1))
record(params, 'image2_width', m.group(2))
record(params, 'image2_caption', m.group(3))
n += 1
if re.match(r'(?i){{taxobox[ _]+image *\| *image *= *\| *caption *= *}}$',
lines[n]):
n += 1
if re.match(r'(?i){{taxobox[ _]+image *\| *image *= *(?:|\|.*)}}$',
lines[n]):
n += 1
if re.match(r'(?i){{taxobox[ _]+image.*(?:Image with unknown copyright status removed|Unsourced image removed)', lines[n]):
n += 1
if re.match(r'(?i)<!--.*-->', lines[n]):
n += 1
if re.match(r'(?is)<!--.*-->', lines[n] + lines[n+1]):
n += 2
m = re.match(r'(?i){{taxobox[ _]+begin[ _]+placement *\| *'
r'color *= *[a-z]+ *}}$', lines[n])
if not m:
raise Error(u"Can't find {{taxobox begin placement}}: %s"
% lines[n])
n += 1
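    # Each {{taxobox <rank> entry}} becomes a parameter named after the rank,
    # optionally followed by an authority template or <small> line.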
while n < len(lines):
m0 = re.match(r'(?i){{taxobox[ _]+([a-z_]+)[ _]+entry[ _]*\| *'
r'taxon *= *([^\}]*[^\} ]) *'
r'<small>(.*)</?small>}}$', lines[n] + lines[n+1])
if m0:
record(params, m0.group(1), m0.group(2))
record(params, m0.group(1) + '_authority', m0.group(3))
n += 2
continue
m1 = re.match(r'(?i){{taxobox[ _]+([a-z_]+)[ _]+entry[ _]*\| *'
r'taxon *= *([^\}]*[^\} ]) *}}(?:<br */?>)?$', lines[n])
if not m1:
break
record(params, m1.group(1), m1.group(2))
n += 1
m2 = re.match(r'(?i){{taxobox[ _]+authority *\| *'
r'author *= *([^\}]*[^\} ]) *\| *'
r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
if m2:
record(params, m1.group(1) + '_authority',
'%s, %s' % (m2.group(1), m2.group(2)))
n += 1
continue
m3 = re.match(r'(?i){{taxobox[ _]+authority[ _]+parens *\| *'
r'author *= *([^\}]*[^\} ]) *\| *'
r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
if m3:
record(params, m1.group(1) + '_authority',
'(%s, %s)' % (m3.group(1), m3.group(2)))
n += 1
continue
m4 = re.match(r'(?i){{taxobox[ _]+authority[ _]+(?:new|botany)? *\| *'
r'author(?:ity)? *= *([^\}]*[^\} ]) *}}$', lines[n])
if m4:
record(params, m1.group(1) + '_authority', m4.group(1))
n += 1
continue
m5 = re.match(r'(?i)<small> *(.*[^ ]) *(?:</?small>)?', lines[n])
if m5:
record(params, m1.group(1) + '_authority', m5.group(1))
n += 1
continue
m = re.match(r'(?i){{taxobox[ _]+end[ _]+placement(?: *\| *color *= *[a-z]+ *)?}}$', lines[n])
if not m:
raise Error(u"Expected {{taxobox end placement}}: %s"
% lines[n])
n += 1
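    # Up to four binomial/trinomial blocks (parameter suffixes '', '2', '3', '4').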
n, found = parse_nomial('', n, lines, params)
if found:
n, found = parse_nomial('2', n, lines, params)
if found:
n, found = parse_nomial('3', n, lines, params)
if found:
n, found = parse_nomial('4', n, lines, params)
m = re.match(r'(?i){{taxobox[ _]+section[ _]+type[ _]+species *\| *'
r'color *= *[a-z]+ *\| *'
r'species *= *([^\}]*[^\} ]) *\| *'
r'comment *= *([^\}]*[^\} ]|) *}}$', lines[n])
if m:
record(params, 'type_species', "''%s''" % m.group(1))
record(params, 'type_species_authority', m.group(2))
n += 1
if re.match(r'(?i)<!--.*-->', lines[n]):
n += 1
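    # Synonym entries are collected and joined into a single <br/>-separated list.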
m = re.match(r'(?i){{taxobox[ _]+begin[ _]+synonyms *\| *'
r'color *= *[a-z]+ *}}$', lines[n])
if m:
n += 1
syn = []
while 1:
m1 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry[ _]+simple'
r' *\| *binomial_name *= *([^\}]*[^\} ]) *}}$',
lines[n])
m2 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry[ _]+botany'
r' *\| *binomial_name *= *([^\}]*[^\} ]) *\| *'
r'author *= *([^\}]*[^\} ]) *}}$',
lines[n])
m3 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry *\| *'
r'binomial_name *= *([^\|\}]*[^\|\} ]) *\| *'
r'author *= *([^\}]*[^\} ]) *\| *'
r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
if m1:
syn.append("''%s''" % m1.group(1))
elif m2:
syn.append("''%s'' <small>%s</small>"
% (m2.group(1), m2.group(2)))
elif m3:
syn.append("''%s'' <small>%s, %s</small>"
% (m3.group(1), m3.group(2), m3.group(3)))
else:
break
n += 1
record(params, 'synonyms', '<br/>'.join(syn))
        m = re.match(r'(?i){{taxobox[ _]+end[ _]+synonyms}}$', lines[n])
        if not m:
            raise Error(u"Expected {{taxobox end synonyms}} but found: %s"
                        % lines[n])
n += 1
if not params.has_key('binomial') and not params.has_key('trinomial'):
n, found = parse_nomial('', n, lines, params)
m = re.match(r'(?i){{taxobox[ _]+section[ _]+diversity *\| *'
r'color *= *[a-z]+ *\| *'
r'link *= *([^\}]*[^\} ]) *\| *'
r'diversity *= *([^\}]*[^\} ]) *}}$', lines[n])
if m:
record(params, 'diversity', m.group(2))
record(params, 'diversity_link', m.group(1))
n += 1
m = re.match(r'(?i){{taxobox[ _]+section[ _]+(?:subdivision|list) *\| *'
r'color *= *[a-z]+ *\| *'
r'plural_taxon *= *([^\}]*[^\} ]) *}}$', lines[n])
if not m:
m = re.match(r'(?i){{taxobox[ _]+section[ _]+(?:subdivision|list) *\| *'
r'plural_taxon *= *([^\}]*[^\} ]) *\| *'
r'color *= *[a-z]+ *}}$', lines[n])
if m:
record(params, 'subdivision_ranks', m.group(1))
n += 1
        start = n
        while not re.match(r'(?i){{taxobox', lines[n]):
            n += 1
        record(params, 'subdivision', '\n' + '\n'.join(lines[start:n]))
if re.match(r'(?i)<!--.*-->', lines[n]):
n += 1
if n + 1 < len(lines) and re.match(r'(?i)<!--.*-->', lines[n] + lines[n+1]):
n += 2
m = re.match(r'(?i){{taxobox[ _]+end *}}$', lines[n])
if not m:
raise Error(u"Unrecognized line: %s" % lines[n])
# Some other checks
if params.has_key('norank'):
raise Error(u"Can't handle {{taxobox norank entry}}, sorry.")
if params.has_key('unranked'):
raise Error(u"Can't handle {{taxobox unranked entry}}, sorry.")
# Fix some simple mistakes.
if (params.has_key('genus') and params.has_key('name')
and params['genus'] == "'''''%s'''''" % params['name']):
params['name'] = "''%s''" % params['name']
if (params.has_key('binomial') and params.has_key('name')
and params['binomial'] == "''%s''" % params['name']):
params['name'] = "''%s''" % params['name']
if (params.has_key('trinomial') and params.has_key('name')
and params['trinomial'] == "''%s''" % params['name']):
params['name'] = "''%s''" % params['name']
if (params.has_key('image_caption')
and canonize(params['image_caption'])
in (canonize(params.get('name', '')),
canonize(params.get('binomial', '')),
canonize(params.get('trinomial', '')),
canonize(params.get('genus', '')) + 'sp',
canonize(params.get('name', '') + params.get('binomial', '')),
)):
del params['image_caption']
if params.has_key('binomial_authority'):
params['binomial_authority'] = re.sub(r',,', ',',
params['binomial_authority'])
if params.has_key('trinomial_authority'):
params['trinomial_authority'] = re.sub(r',,', ',',
params['trinomial_authority'])
    if params.has_key('genus') and re.match(r"'''''\[\[.*\]\]'''''$", params['genus']):
params['genus'] = params['genus'][3:-3]
if params.has_key('name'):
m = re.match(r"<center> *(.*[^ ]) *</center>$", params['name'])
if m:
params['name'] = m.group(1)
if params.has_key('subdivision_ranks'):
m = re.match(r"<center> *(.*[^ ]) *</center>$", params['subdivision_ranks'])
if m:
params['subdivision_ranks'] = m.group(1)
if params.has_key('genus') and re.match(r"(''')?[^']+\1$", params['genus']):
params['genus'] = "''%s''" % params['genus']
if params.has_key('species') and re.match(r"(''')?[^']+\1$", params['species']):
params['species'] = "''%s''" % params['species']
if params.has_key('subspecies') and re.match(r"(''')?[^']+\1$", params['subspecies']):
params['subspecies'] = "''%s''" % params['subspecies']
if params.has_key('species') and params.has_key('binomial') and re.match(r"''[^']+''$", params['species']):
params['species'] = "'''%s'''" % params['species']
if params.has_key('subspecies') and params.has_key('trinomial') and re.match(r"''[^']+''$", params['subspecies']):
params['subspecies'] = "'''%s'''" % params['subspecies']
if params.has_key('subdivision') and canonize(params['subdivision']) == 'seetext':
params['subdivision'] = '\nSee text.'
if (params.has_key('binomial') and params.has_key('species')
and re.match("'''''[^']*'''''$", params['species'])):
m = re.match(r"'*([A-Z])[a-z-]* ([a-z-]*)'*", params['binomial'])
if m:
params['species'] = "'''''%s. %s'''''" % (m.group(1), m.group(2))
if (params.has_key('trinomial') and params.has_key('subspecies')
and re.match("'''''.*'''''$", params['subspecies'])):
m = re.match(r"'*([A-Z])[a-z-]* ([a-z])[a-z-]* ([a-z][a-z-]*)'*", params['trinomial'])
if m:
params['subspecies'] = "'''''%s. %s. %s'''''" % (m.group(1), m.group(2), m.group(3))
return params
def convert(pl):
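    """Convert the multi-template taxobox on page pl to a single {{Taxobox}}
    call and save the result."""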
text = pl.get()
if edit:
text = edittext(text)
params = parse(text, pl.title())
newtext = re.sub(r'(?is){{taxobox[ _]+begin *\|.*{{taxobox[ _]+end *}}',
'{{Taxobox\n'
+ ''.join(map(lambda k: '| %s = %s\n' % (k, params[k]),
filter(lambda s: params.has_key(s),
params['sequence'])))
+ '}}', text)
newtext = check(pl.get(), newtext)
if newtext:
status, reason, data = pl.put(newtext, u'nomialbot — converted multi-template taxobox to {{Taxobox}}')
global done
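    # The old pywikipedia put() returns (status, reason, data); the script
    # treats an empty response body as a successful save.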
if data == '':
done[pl.title()] = True
def convertmany():
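    """Work through linknames starting at index n, fetching pages in batches
    and converting each one that is not already done."""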
global site, n, linknames, done
pages = map(lambda l: wikipedia.Page(site, l), linknames)
fetched = []
while n < len(linknames):
try:
if not done.get(linknames[n]):
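                # Preload the next batch of up to 50 unfinished pages in one request.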
if linknames[n] not in fetched:
tofetch = filter(lambda p: not done.get(p.title()), pages[n:])[:50]
wikipedia.getall(site, tofetch)
fetched += map(lambda p: p.title(), tofetch)
wikipedia.output("Trying %s" % linknames[n])
                if pages[n].namespace() != 0:
                    done[pages[n].title()] = True
                    wikipedia.output(u"%s not in main namespace" % pages[n].title())
else:
convert(pages[n])
except wikipedia.LockedPage:
wikipedia.output("%s is locked" % linknames[n])
except wikipedia.IsRedirectPage:
wikipedia.output("%s is redirect" % linknames[n])
done[linknames[n]] = True
except NoError:
            pass
except Error, e:
wikipedia.output(u'***' + e.text)
n += 1
def main():
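    """Parse command-line options, load and save progress in taxoconvert.db,
    and run the conversion."""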
global checks, edit, debug
offset = None
reload = None
try:
opts, args = getopt.getopt(sys.argv[1:], 'r:dneo:',
['reload=', 'debug', 'no-checks', 'edit', 'offset='])
for o, a in opts:
if o in ('-n', '--no-checks'):
checks = False
elif o in ('-o', '--offset'):
offset = int(a)
elif o in ('-e', '--edit'):
edit = True
elif o in ('-d', '--debug'):
debug = True
elif o in ('-r', '--reload'):
reload = a
else:
print "Bad option: %s" % o
return
except getopt.GetoptError:
print "Bad command line"
return
global n, linknames, done
done = {}
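    # Progress (current index, work list, finished pages) is persisted in taxoconvert.db.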
try:
f = file('taxoconvert.db', 'rb')
n, linknames, done = pickle.load(f)
f.close()
if reload:
raise IOError
except IOError:
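        # First run (or --reload): rebuild the work list from the pages that
        # reference the given template page.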
tb = wikipedia.Page(site, reload)
linknames = map(lambda p:p.title(), tb.getReferences())
print len(linknames), "pages found"
n = 0
try:
        if offset is not None:
n = offset
if args:
for aa in args:
convert(wikipedia.Page(site, aa))
else:
convertmany()
finally:
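        # Save progress atomically: write to a new file, then rename it over the old database.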
f = file('taxoconvert.db.new', 'wb')
pickle.dump((n, linknames, done), f)
f.close()
os.rename('taxoconvert.db.new', 'taxoconvert.db')
if __name__ == '__main__':
try:
main()
finally:
wikipedia.stopme()