# Da Wikipedia, l'enciclopedia libera.
# -*- coding: utf-8 -*-
#
# (C) Daniel Herding, 2004
#
# Distributed under the terms of the MIT license.
#
__version__='$Id: replace.py,v 1.102 2006/03/12 16:35:54 wikipedian Exp $'
from __future__ import generators
import sys, re
import wikipedia, pagegenerators, catlib, config
# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# below.
# Every message ends in ' %s'; the placeholder receives a description of the
# replacement that was made.
msg = {
    'de': u'Bot: Automatisierte Textersetzung %s',
    'en': u'Robot: Automated text replacement %s',
    'es': u'Robot: Reemplazo automático de texto %s',
    'fr': u'Bot : Remplacement de texte automatisé %s',
    'hu': u'Robot: Automatikus szövegcsere %s',
    'ia': u'Robot: Reimplaciamento automatic de texto %s',
    'is': u'Vélmenni: breyti texta %s',
    'it': u'Bot: Sostituzione automatica del testo %s',
    # The Georgian and Serbian texts had been damaged by a wrong
    # re-encoding (UTF-8 bytes read as cp1252); they are restored here.
    'ka': u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
    'lt': u'Botas: Automatinis teksto keitimas %s',
    'pt': u'Bot: Mudança automática %s',
    'sr': u'Бот: Аутоматска замена текста %s',
}
# Predefined replacements tasks.
# Each task maps to a dictionary with the keys:
#   'regex'        - whether the replacement sources are regular expressions
#   'msg'          - edit summaries, keyed by language code
#   'replacements' - list of (pattern, replacement) 2-tuples, applied in order
# Replacement strings are written as u'\\1...' (same value as the raw form
# ur'\1...') so that they use the same escaping style as the patterns.
fixes = {
    # for it.wikipedia
    'accenti': {
        'regex': True,
        'msg': {
            'it':u'Correzione di alcuni errori comuni contenuti in questa [[Discussioni_Wikipedia:Bot/Sostituzioni/Espressioni_regolari|lista]]',
        },
        'replacements': [
            # conjunctions (and other words) ending in -chè, rewritten to -ché
            (u'\\b([Aa])bbenchè\\b', u'\\1bbenché'),
            (u'\\b([aA])cciocchè\\b', u'\\1cciocché'),
            (u'\\b([aA])ffinchè\\b', u'\\1ffinché'),
            (u'\\b([aA])lcunchè\\b', u'\\1lcunché'),
            (u'\\b([aA])llorchè\\b', u'\\1llorché'),
            (u'\\b([aA])ltrochè\\b', u'\\1ltroché'),
            (u'\\b([aA])ncorchè\\b', u'\\1ncorché'),
            (u'\\b([aA])nzichè\\b', u'\\1nziché'),
            (u'\\b([aA])ttesochè\\b', u'\\1ttesoché'),
            (u'\\b([aA])vvegnachè\\b', u'\\1vvegnaché'),
            (u"\\b([aA])vvegnadiochè\\b", u"\\1vvegnadioché"),
            (u"\\b([aA])vvengachè\\b", u"\\1vvengaché"),
            (u"\\b([aA])vvengadiochè\\b", u"\\1vvengadioché"),
            (u'\\b([bB])enchè\\b', u'\\1enché'),
            (u'\\b([cC])hecchè\\b', u'\\1hecché'),
            (u"\\b([cC])iocchè\\b", u"\\1iocché"),
            (u'\\b([cC])omecchè\\b', u'\\1omecché'),
            (u"\\b([cC])onciofossechè\\b", u"\\1onciofosseché"),
            (u'\\b([cC])ontuttochè\\b', u'\\1ontuttoché'),
            (u'\\b([cC])osicchè\\b', u'\\1osicché'),
            (u'\\b([cC])otalchè\\b', u'\\1otalché'),
            (u'\\b([dD])acchè\\b', u'\\1acché'),
            (u'\\b([dD])appoichè\\b', u'\\1appoiché'),
            (u'\\b([dD])imodochè\\b', u'\\1imodoché'),
            (u"\\b([dD])opochè\\b", u"\\1opoché"),
            (u"\\b([dD])opodichè\\b", u"\\1opodiché"),
            (u'\\b([eE])ssendochè\\b', u'\\1ssendoché'),
            (u'\\b([fF])inattantochè\\b', u'\\1inattantoché'),
            (u'\\b([fF])inchè\\b', u'\\1inché'),
            (u'\\b([fF])intantochè\\b', u'\\1intantoché'),
            (u"\\b([fF])inacchè\\b", u"\\1inacché"),
            (u'\\b([fF])uorchè\\b', u'\\1uorché'),
            (u'\\b([gG])iacchè\\b', u'\\1iacché'),
            (u'\\b([gG])ranchè\\b', u'\\1ranché'),
            (u"\\b([gG])iafossechè\\b", u"\\1iafosseché"),
            (u"\\b([gG])iafossecosachè\\b", u"\\1iafossecosaché"),
            (u"\\b([iI])nfinattantochè\\b", u"\\1nfinattantoché"),
            (u'\\b([lL])orchè\\b', u'\\1orché'),
            (u"\\b([iI])nquantochè\\b", u"\\1nquantoché"),
            (u'\\b([mM])acchè\\b', u'\\1acché'),
            (u'\\b([nN])onchè\\b', u'\\1onché'),
            (u"\\b([nN])onsochè\\b", u"\\1onsoché"),
            (u'\\b([oO])ltrechè\\b', u'\\1ltreché'),
            (u"\\b([oO])ndechè\\b", u"\\1ndeché"),
            (u'\\b([pP])erchè\\b', u'\\1erché'),
            (u'\\b([pP])erciocchè\\b', u'\\1erciocché'),
            (u'\\b([pP])erlochè\\b', u'\\1erloché'),
            (u'\\b([pP])erocchè\\b', u'\\1erocché'),
            (u'\\b([pP])oichè\\b', u'\\1oiché'),
            (u"\\b([pP])osciachè\\b", u"\\1osciaché"),
            (u'\\b([pP])ressochè\\b', u'\\1ressoché'),
            (u'\\b([pP])urchè\\b', u'\\1urché'),
            (u"\\b([qQ])uantochè\\b", u"\\1uantoché"),
            (u"\\b([qQ])uasichè\\b", u"\\1uasiché"),
            (u"\\b([sS])econdochè\\b", u"\\1econdoché"),
            (u'\\b([sS])ennonchè\\b', u'\\1ennonché'),
            (u'\\b([sS])enonchè\\b', u'\\1enonché'),
            (u'\\b([sS])icchè\\b', u'\\1icché'),
            (u'\\b([sS])inattantochè\\b', u'\\1inattantoché'),
            (u'\\b([sS])inchè\\b', u'\\1inché'),
            (u'\\b([sS])intantochè\\b', u'\\1intantoché'),
            (u"\\b([sS])tantechè\\b", u"\\1tanteché"),
            (u'\\b([tT])alchè\\b', u'\\1alché'),
            (u"\\b([tT])almentechè\\b", u"\\1almenteché"),
            (u'\\b([tT])antochè\\b', u'\\1antoché'),
            (u"\\b([tT])rannechè\\b", u"\\1ranneché"),
            (u'\\b([tT])uttochè\\b', u'\\1uttoché'),
            # passato remoto verb forms: final -è rewritten to -é
            (u"\\b([aA])bbattè\\b", u"\\1bbatté"),
            (u"\\b([aA])ccedè\\b", u"\\1ccedé"),
            (u"\\b([aA])ddissè\\b", u"\\1ddissé"),
            (u"\\b([aA])dempiè\\b", u"\\1dempié"),
            (u"\\b([aA])nnettè\\b", u"\\1nnetté"),
            (u"\\b([aA])ntiprendè\\b", u"\\1ntiprendé"),
            (u"\\b([aA])ppartenè\\b", u"\\1ppartené"),
            (u"\\b([aA])ppendè\\b", u"\\1ppendé"),
            (u"\\b([aA])pprendè\\b", u"\\1pprendé"),
            (u"\\b([aA])rrendè\\b", u"\\1rrendé"),
            (u"\\b([aA])rrompè\\b", u"\\1rrompé"),
            (u"\\b([aA])ssistè\\b", u"\\1ssisté"),
            (u"\\b([aA])ssolvè\\b", u"\\1ssolvé"),
            (u"\\b([aA])stenè\\b", u"\\1stené"),
            (u"\\b([aA])ttenè\\b", u"\\1ttené"),
            (u"\\b([aA])vvedè\\b", u"\\1vvedé"),
            (u"\\b([bB])attè\\b", u"\\1atté"),
            (u"\\b([cC])edè\\b", u"\\1edé"),
            (u"\\b([cC])ernè\\b", u"\\1erné"),
            (u"\\b([cC])hiedè\\b", u"\\1hiedé"),
            (u"\\b([cC])ombattè\\b", u"\\1ombatté"),
            (u"\\b([cC])ompetè\\b", u"\\1ompeté"),
            (u"\\b([cC])ompiè\\b", u"\\1ompié"),
            (u"\\b([cC])omprendè\\b", u"\\1omprendé"),
            (u"\\b([cC])oncedè\\b", u"\\1oncedé"),
            (u"\\b([cC])oncernè\\b", u"\\1oncerné"),
            (u"\\b([cC])onnettè\\b", u"\\1onnetté"),
            (u"\\b([cC])onsistè\\b", u"\\1onsisté"),
            (u"\\b([cC])ontenè\\b", u"\\1ontené"),
            (u"\\b([cC])ontrobattè\\b", u"\\1ontrobatté"),
            (u"\\b([cC])onvedè\\b", u"\\1onvedé"),
            (u"\\b([cC])onvergè\\b", u"\\1onvergé"),
            (u"\\b([cC])onvivè\\b", u"\\1onvivé"),
            (u"\\b([cC])orrompè\\b", u"\\1orrompé"),
            (u"\\b([cC])redè\\b", u"\\1redé"),
            (u"\\b([dD])ecedè\\b", u"\\1ecedé"),
            (u"\\b([dD])eflettè\\b", u"\\1efletté"),
            (u"\\b([dD])elinquè\\b", u"\\1elinqué"),
            (u"\\b([dD])esistè\\b", u"\\1esisté"),
            (u"\\b([dD])etenè\\b", u"\\1etené"),
            (u"\\b([dD])evolvè\\b", u"\\1evolvé"),
            (u"\\b([dD])ibattè\\b", u"\\1ibatté"),
            (u"\\b([dD])ipendè\\b", u"\\1ipendé"),
            (u"\\b([dD])iprendè\\b", u"\\1iprendé"),
            (u"\\b([dD])irimè\\b", u"\\1irimé"),
            (u"\\b([dD])iscernè\\b", u"\\1iscerné"),
            (u"\\b([dD])isottenè\\b", u"\\1isottené"),
            (u"\\b([dD])isperdè\\b", u"\\1isperdé"),
            (u"\\b([dD])isplendè\\b", u"\\1isplendé"),
            (u"\\b([dD])issolvè\\b", u"\\1issolvé"),
            (u"\\b([dD])issovvennè\\b", u"\\1issovvenné"),
            (u"\\b([dD])istemè\\b", u"\\1istemé"),
            (u"\\b([dD])isvolvè\\b", u"\\1isvolvé"),
            (u"\\b([eE])ccedè\\b", u"\\1ccedé"),
            (u"\\b([eE])lidè\\b", u"\\1lidé"),
            (u"\\b([eE])ludè\\b", u"\\1ludé"),
            (u"\\b([eE])rompè\\b", u"\\1rompé"),
            (u"\\b([eE])sigè\\b", u"\\1sigé"),
            (u"\\b([eE])simè\\b", u"\\1simé"),
            (u"\\b([eE])sistè\\b", u"\\1sisté"),
            (u"\\b([eE])spandè\\b", u"\\1spandé"),
            (u"\\b([eE])stroquè\\b", u"\\1stroqué"),
            (u"\\b([eE])volvè\\b", u"\\1volvé"),
            (u"\\b([fF])endè\\b", u"\\1endé"),
            (u"\\b([fF])ervè\\b", u"\\1ervé"),
            (u"\\b([fF])lettè\\b", u"\\1letté"),
            (u"\\b([fF])rapprendè\\b", u"\\1rapprendé"),
            (u"\\b([fF])remè\\b", u"\\1remé"),
            (u"\\b([gG])enuflettè\\b", u"\\1enufletté"),
            (u"\\b([iI])mbattè\\b", u"\\1mbatté"),
            (u"\\b([iI])mbevè\\b", u"\\1mbevé"),
            (u"\\b([iI])mpiè\\b", u"\\1mpié"),
            (u"\\b([iI])mprendè\\b", u"\\1mprendé"),
            (u"\\b([iI])ncedè\\b", u"\\1ncedé"),
            (u"\\b([iI])ncombè\\b", u"\\1ncombé"),
            (u"\\b([iI])nfremè\\b", u"\\1nfremé"),
            (u"\\b([iI])nsistè\\b", u"\\1nsisté"),
            (u"\\b([iI])ntercedè\\b", u"\\1ntercedé"),
            (u"\\b([iI])nterprendè\\b", u"\\1nterprendé"),
            (u"\\b([iI])nterrompè\\b", u"\\1nterrompé"),
            (u"\\b([iI])ntessè\\b", u"\\1ntessé"),
            (u"\\b([iI])ntraprendè\\b", u"\\1ntraprendé"),
            (u"\\b([iI])ntrarompè\\b", u"\\1ntrarompé"),
            (u"\\b([iI])ntratessè\\b", u"\\1ntratessé"),
            (u"\\b([iI])ntrattenè\\b", u"\\1ntrattené"),
            (u"\\b([iI])ntravedè\\b", u"\\1ntravedé"),
            (u"\\b([iI])ntroflettè\\b", u"\\1ntrofletté"),
            (u"\\b([iI])rrompè\\b", u"\\1rrompé"),
            (u"\\b([mM])antenè\\b", u"\\1antené"),
            (u"\\b([mM])ietè\\b", u"\\1ieté"),
            (u"\\b([oO])ttenè\\b", u"\\1ttené"),
            (u"\\b([pP])endè\\b", u"\\1endé"),
            (u"\\b([pP])erdè\\b", u"\\1erdé"),
            (u"\\b([pP])ersistè\\b", u"\\1ersisté"),
            (u"\\b([pP])iovè\\b", u"\\1iové"),
            (u"\\b([pP])ossedè\\b", u"\\1ossedé"),
            (u"\\b([pP])otè\\b", u"\\1oté"),
            (u"\\b([pP])recedè\\b", u"\\1recedé"),
            (u"\\b([pP])reesistè\\b", u"\\1reesisté"),
            (u"\\b([pP])remè\\b", u"\\1remé"),
            (u"\\b([pP])rendè\\b", u"\\1rendé"),
            (u"\\b([pP])rescindè\\b", u"\\1rescindé"),
            (u"\\b([pP])resiedè\\b", u"\\1resiedé"),
            (u"\\b([pP])revedè\\b", u"\\1revedé"),
            (u"\\b([pP])rocedè\\b", u"\\1rocedé"),
            (u"\\b([pP])ropendè\\b", u"\\1ropendé"),
            (u"\\b([pP])rorompè\\b", u"\\1rorompé"),
            (u"\\b([pP])rovolvè\\b", u"\\1rovolvé"),
            (u"\\b([rR])apprendè\\b", u"\\1apprendé"),
            (u"\\b([rR])attenè\\b", u"\\1attené"),
            (u"\\b([rR])avvedè\\b", u"\\1avvedé"),
            (u"\\b([rR])ecedè\\b", u"\\1ecedé"),
            (u"\\b([rR])edigè\\b", u"\\1edigé"),
            (u"\\b([rR])endè\\b", u"\\1endé"),
            (u"\\b([rR])esistè\\b", u"\\1esisté"),
            (u"\\b([rR])etrocedè\\b", u"\\1etrocedé"),
            (u"\\b([rR])iannettè\\b", u"\\1iannetté"),
            (u"\\b([rR])ibattè\\b", u"\\1ibatté"),
            (u"\\b([rR])icedè\\b", u"\\1icedé"),
            (u"\\b([rR])icevè\\b", u"\\1icevé"),
            (u"\\b([rR])ichiedè\\b", u"\\1ichiedé"),
            (u"\\b([rR])iconnettè\\b", u"\\1iconnetté"),
            (u"\\b([rR])iconverrè\\b", u"\\1iconverré"),
            (u"\\b([rR])icredè\\b", u"\\1icredé"),
            (u"\\b([rR])iedè\\b", u"\\1iedé"),
            (u"\\b([rR])iempiè\\b", u"\\1iempié"),
            (u"\\b([rR])iflettè\\b", u"\\1ifletté"),
            (u"\\b([rR])ingodè\\b", u"\\1ingodé"),
            (u"\\b([rR])ipentè\\b", u"\\1ipenté"),
            (u"\\b([rR])ipetè\\b", u"\\1ipeté"),
            (u"\\b([rR])iprendè\\b", u"\\1iprendé"),
            (u"\\b([rR])isedè\\b", u"\\1isedé"),
            (u"\\b([rR])isiedè\\b", u"\\1isiedé"),
            (u"\\b([rR])isolvè\\b", u"\\1isolvé"),
            (u"\\b([rR])isplendè\\b", u"\\1isplendé"),
            (u"\\b([rR])itenè\\b", u"\\1itené"),
            (u"\\b([rR])ivedè\\b", u"\\1ivedé"),
            (u"\\b([rR])ivendè\\b", u"\\1ivendé"),
            (u"\\b([rR])ivivè\\b", u"\\1ivivé"),
            (u"\\b([rR])ompè\\b", u"\\1ompé"),
            (u"\\b([sS])battè\\b", u"\\1batté"),
            (u"\\b([sS])candè\\b", u"\\1candé"),
            (u"\\b([sS])cernè\\b", u"\\1cerné"),
            (u"\\b([sS])connettè\\b", u"\\1connetté"),
            (u"\\b([sS])ecernè\\b", u"\\1ecerné"),
            (u"\\b([sS])fottè\\b", u"\\1fotté"),
            (u"\\b([sS])occombè\\b", u"\\1occombé"),
            (u"\\b([sS])oprassedè\\b", u"\\1oprassedé"),
            (u"\\b([sS])opravvivè\\b", u"\\1opravvivé"),
            (u"\\b([sS])orprendè\\b", u"\\1orprendé"),
            (u"\\b([sS])ostenè\\b", u"\\1ostené"),
            (u"\\b([sS])pandè\\b", u"\\1pandé"),
            (u"\\b([sS])perdè\\b", u"\\1perdé"),
            (u"\\b([sS])plendè\\b", u"\\1plendé"),
            (u"\\b([sS])premè\\b", u"\\1premé"),
            (u"\\b([sS])ternè\\b", u"\\1terné"),
            (u"\\b([sS])trafottè\\b", u"\\1trafotté"),
            (u"\\b([sS])travedè\\b", u"\\1travedé"),
            (u"\\b([sS])tridè\\b", u"\\1tridé"),
            (u"\\b([tT])emè\\b", u"\\1emé"),
            (u"\\b([tT])enè\\b", u"\\1ené"),
            (u"\\b([tT])essè\\b", u"\\1essé"),
            (u"\\b([tT])ralucè\\b", u"\\1ralucé"),
            (u"\\b([tT])ransigè\\b", u"\\1ransigé"),
            (u"\\b([tT])rattenè\\b", u"\\1rattené"),
            (u"\\b([tT])ravedè\\b", u"\\1ravedé"),
            (u"\\b([vV])edè\\b", u"\\1edé"),
            (u"\\b([vV])endè\\b", u"\\1endé"),
            (u"\\b([vV])ertè\\b", u"\\1erté"),
            # Terms of French origin (and Italian ones such as caffè).
            # First group: final -è rewritten to -é.
            (u'\\b([aA])ntirè\\b', u'\\1ntiré'),
            (u'\\b([aA])utodafè\\b', u'\\1utodafé'),
            (u'\\b([cC])annetè\\b', u'\\1anneté'),
            (u'\\b([cC])apitonnè\\b', u'\\1apitonné'),
            (u'\\b([cC])lichè\\b', u'\\1liché'),
            (u'\\b([cC])loisonnè\\b', u'\\1loisonné'),
            (u'\\b([cC])onsommè\\b', u'\\1onsommé'),
            (u"\\b([cC])impanzè\\b", u"\\1impanzé"),
            (u'\\b([cC])oupè\\b', u'\\1oupé'),
            (u'\\b([cC])raquelè\\b', u'\\1raquelé'),
            (u'\\b([dD])ecolletè\\b', u'\\1ecolleté'),
            (u'\\b([dD])écolletè\\b', u'\\1écolleté'),
            (u'\\b([dD])efilè\\b', u'\\1efilé'),
            (u'\\b([dD])éfilè\\b', u'\\1éfilé'),
            (u'\\b([dD])egagè\\b', u'\\1egagé'),
            (u'\\b([dD])égagè\\b', u'\\1égagé'),
            (u'\\b([dD])elavè\\b', u'\\1elavé'),
            (u'\\b([dD])élavè\\b', u'\\1élavé'),
            (u'\\b([dD])emodè\\b', u'\\1emodé'),
            (u'\\b([dD])émodè\\b', u'\\1émodé'),
            (u'\\b([dD])eracinè\\b', u'\\1eraciné'),
            (u'\\b([dD])éracinè\\b', u'\\1éraciné'),
            (u'\\b([dD])eshabillè\\b', u'\\1eshabillé'),
            (u'\\b([dD])éshabillè\\b', u'\\1éshabillé'),
            (u'\\b([eE])cartè\\b', u'\\1carté'),
            # 'gâtè' had been stored as the mis-encoded 'gƒtè'
            (u'\\b([eE])nfant gâtè\\b', u'\\1nfant gâté'),
            (u'\\b([eE])ngagè\\b', u'\\1ngagé'),
            (u'\\b([fF])lambè\\b', u'\\1lambé'),
            (u'\\b([fF])oncè\\b', u'\\1oncé'),
            (u'\\b([fF])risè\\b', u'\\1risé'),
            (u'\\b([gG])aufrè\\b', u'\\1aufré'),
            (u'\\b([gG])lacè\\b', u'\\1lacé'),
            (u"\\b([gG])ranmercè\\b", u"\\1ranmercé"),
            (u'\\b([hH])abituè\\b', u'\\1abitué'),
            (u'\\b([hH])ôtel meublè\\b', u'\\1ôtel meublé'),
            (u'\\b([iI])mprimè\\b', u'\\1mprimé'),
            (u'\\b([iI])nterrè\\b', u'\\1nterré'),
            (u'\\b([kK])aritè\\b', u'\\1arité'),
            (u'\\b([mM])arron glacè\\b', u'\\1arron glacé'),
            (u'\\b([mM])atelassè\\b', u'\\1atelassé'),
            (u'\\bmercè\\b', u'mercé'),
            (u'\\b([mM])erzè\\b', u'\\1erzé'),
            (u'\\b([mM])eublè\\b', u'\\1eublé'),
            (u'\\b([mM])oirè\\b', u'\\1oiré'),
            (u'\\b([mM])oulinè\\b', u'\\1ouliné'),
            (u'\\b([nN])egligè\\b', u'\\1egligé'),
            (u'\\b([nN])égligè\\b', u'\\1égligé'),
            (u"\\b([nN])ontiscordardimè\\b", u"\\1ontiscordardimé"),
            (u'\\b([pP])ancarrè\\b', u'\\1ancarré'),
            (u'\\b([pP])âtè\\b', u'\\1âté'),
            (u'\\b([sS])aint-honorè\\b', u'\\1aint-honoré'),
            (u'\\b([sS])cimpanz[eè]\\b', u'\\1cimpanzé'),
            (u'\\b([sS])eparè\\b', u'\\1eparé'),
            (u'\\b([sS])oufflè\\b', u'\\1oufflé'),
            (u'\\b([tT])amurè\\b', u'\\1amuré'),
            (u'\\b([tT])rentatrè\\b', u'\\1rentatré'),
            (u'\\b([tT])ruffè\\b', u'\\1ruffé'),
            (u'\\b([vV])arietè\\b', u'\\1arieté'),
            (u'\\b([vV])ariétè\\b', u'\\1ariété'),
            (u'\\b([vV])icerè\\b', u'\\1iceré'),
            (u'\\b([vV])entitrè\\b', u'\\1entitré'),
            # Second group: final -é rewritten to -è.
            (u'\\b([aA])himé\\b', u'\\1himè'),
            (u'\\b([aA])mmazzacaffé\\b', u'\\1mmazzacaffè'),
            (u'\\b([aA])ppié\\b', u'\\1ppiè'),
            (u'\\b([bB])igné\\b', u'\\1ignè'),
            (u'\\b([bB])uffé\\b', u'\\1uffè'),
            (u'\\b([cC])abaré\\b', u'\\1abarè'),
            (u'\\b([cC])abriolé\\b', u'\\1abriolè'),
            (u'\\b([cC])anapé\\b', u'\\1anapè'),
            (u'\\b([cC])arcadé\\b', u'\\1arcadè'),
            (u'\\b([cC])hedivé\\b', u'\\1hedivè'),
            (u'\\b([cC])ioé\\b', u'\\1ioè'),
            (u'\\b([cC])occodé\\b', u'\\1occodè'),
            (u'\\b([cC])ontrobuffé\\b', u'\\1ontrobuffè'),
            (u'\\b([cC])orvé\\b', u'\\1orvè'),
            (u'\\b([cC])roscé\\b', u'\\1roscè'),
            (u'\\b([cC])upé\\b', u'\\1upè'),
            (u'\\b([dD])appié\\b', u'\\1appiè'),
            (u'\\b([dD])osacaffé\\b', u'\\1osacaffè'),
            (u'\\b([eE])uhoé\\b', u'\\1uhoè'),
            (u'\\b([fF])orfé\\b', u'\\1orfè'),
            (u'\\b([kK])arkadé\\b', u'\\1arkadè'),
            (u'\\b([kK])edivé\\b', u'\\1edivè'),
            (u'\\b([lL])acché\\b', u'\\1acchè'),
            (u'\\b([mM])acinacaffé\\b', u'\\1acinacaffè'),
            (u'\\b([mM])acramé\\b', u'\\1acramè'),
            (u'\\b([mM])ordoré\\b', u'\\1ordorè'),
            (u'\\b([mM])usmé\\b', u'\\1usmè'),
            (u'\\b([nN])arghilé\\b', u'\\1arghilè'),
            (u'\\b([pP])arché\\b', u'\\1archè'),
            (u'\\b([pP])uré\\b', u'\\1urè'),
            (u'\\b([rR])adiorelé\\b', u'\\1adiorelè'),
            (u'\\b([rR])amié\\b', u'\\1amiè'),
            (u'\\b([sS])ufflé\\b', u'\\1ufflè'),
            (u'\\b([tT])oppé\\b', u'\\1oppè'),
            (u'\\b([tT])ostacaffé\\b', u'\\1ostacaffè'),
            (u'\\b([tT])uppé\\b', u'\\1uppè'),
            (u'\\b([vV])ahiné\\b', u'\\1ahinè'),
            (u'\\bGiosué\\b', u'Giosuè'),
            (u'\\bMosé\\b', u'Mosè'),
            # Other replacements: normalise section headings.
            # The first pattern used to start with a stray '[Ll]' (copied
            # from the 'link esterni' rule), so it never matched a heading.
            (u"(?m)(== ?Collegamenti Esterni ?==)", u"== Collegamenti esterni =="),
            (u"(?m)(== ?[Ll]ink [Ee]sterni ?==)", u"== Collegamenti esterni =="),
            (u"(?m)(== ?[Vv]edi [Aa]nche ?==)", u"== Voci correlate =="),
        ]
    },
}
class XmlDumpReplacePageGenerator:
    """
    Iterable over the pages of a local XML dump file (cur table) which might
    contain text that has to be replaced.

    An entry of the dump is yielded as a wikipedia.Page when none of the
    exception patterns is found in its text and at least one of the
    replacement patterns is.
    """

    def __init__(self, xmlFilename, replacements, exceptions):
        """
        Arguments:
            * xmlFilename  - Path of the dump, absolute or relative.
            * replacements - List of 2-tuples: the original text as a
                             compiled regular expression, and the
                             replacement text as a string.
            * exceptions   - List of compiled regular expressions; a page
                             whose text matches any of them is left out.
        """
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions

    def _anyMatch(self, patterns, text):
        """Return True as soon as one of the compiled patterns is found in text."""
        for pattern in patterns:
            if pattern.search(text):
                return True
        return False

    def __iter__(self):
        # xmlreader is only needed when a dump is actually read
        import xmlreader
        site = wikipedia.getSite()
        for entry in xmlreader.XmlDump(self.xmlFilename).parse():
            if self._anyMatch(self.exceptions, entry.text):
                # the page contains excepted text, leave it alone
                continue
            # TODO: leave out pages that only have old inside nowiki, comments, math
            originals = [old for old, new in self.replacements]
            if self._anyMatch(originals, entry.text):
                yield wikipedia.Page(site, entry.title)
class ReplaceRobot:
    """
    A bot that can do text replacements.
    """

    def __init__(self, generator, replacements, exceptions=None, acceptall=False):
        """
        Arguments:
            * generator    - A generator that yields Page objects.
            * replacements - A list of 2-tuples of original text (as a
                             compiled regular expression) and replacement
                             text (as a string).
            * exceptions   - A list of compiled regular expressions; pages
                             which contain text that matches one of these
                             won't be changed. Defaults to no exceptions.
            * acceptall    - If True, the user won't be prompted before
                             changes are made.
        """
        # A literal [] default would be one list shared by every instance
        # created without this argument, so a fresh list is made per robot.
        if exceptions is None:
            exceptions = []
        self.generator = generator
        self.replacements = replacements
        self.exceptions = exceptions
        self.acceptall = acceptall

    def checkExceptions(self, original_text):
        """
        If one of the exceptions applies for the given text, returns the
        substring which matches the exception. Otherwise it returns None.
        """
        for exception in self.exceptions:
            hit = exception.search(original_text)
            if hit:
                return hit.group(0)
        return None

    def doReplacements(self, original_text):
        """
        Returns the text which is generated by applying all replacements to
        the given text, in the order in which they are listed.
        """
        new_text = original_text
        for old, new in self.replacements:
            new_text = wikipedia.replaceExceptMathNowikiAndComments(new_text, old, new)
        return new_text

    def run(self):
        """
        Starts the robot.

        Every page of the generator is loaded, checked against the
        exceptions, run through the replacements and, if the text changed,
        shown as a diff. Unless acceptall is set the user is asked whether
        the change should be saved; answering 'All' sets acceptall for the
        remaining pages.
        """
        # Run the generator which will yield Pages which might need to be
        # changed.
        for page in self.generator:
            try:
                # Load the page's text from the wiki
                original_text = page.get()
                if not page.canBeEdited():
                    wikipedia.output(u'Skipping locked page %s' % page.title())
                    continue
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' % page.title())
                continue
            except wikipedia.IsRedirectPage:
                # redirect pages are processed too, using the redirect's own text
                original_text = page.get(get_redirect=True)

            # skip all pages that contain certain texts
            match = self.checkExceptions(original_text)
            if match:
                wikipedia.output(u'Skipping %s because it contains %s' % (page.title(), match))
                continue

            new_text = self.doReplacements(original_text)
            if new_text == original_text:
                wikipedia.output('No changes were necessary in %s' % page.title())
                continue

            wikipedia.output(u'\n>>> %s <<<' % page.title())
            wikipedia.showDiff(original_text, new_text)
            if not self.acceptall:
                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                if choice in ['a', 'A']:
                    self.acceptall = True
            # 'choice' is only read when the user has just been asked
            if self.acceptall or choice in ['y', 'Y']:
                page.put(new_text)
def prepareRegexForMySQL(pattern):
    """
    Adapts a Python regular expression so that it can be put between single
    quotes after MySQL's RLIKE operator.

    * The shorthands \\s, \\d and \\w become the POSIX classes [:space:],
      [:digit:] and [:alnum:]. MySQL only understands these names inside a
      bracket expression, so outside of one they are wrapped in an extra pair
      of brackets ('\\s' -> '[[:space:]]'); inside an existing bracket
      expression only the class name is inserted ('[\\s,]' -> '[[:space:],]').
      Previously the bare name was inserted everywhere, which MySQL reads as
      "one of the characters : s p a c e".
    * Escape sequences are read as pairs, so an escaped backslash followed by
      's', 'd' or 'w' is no longer mistaken for a shorthand, and an already
      escaped quote is not escaped a second time.
    * Unescaped single quotes are backslash-escaped so that they cannot end
      the SQL string literal.

    Other Python-only constructs (for example \\b) are passed through
    unchanged. Returns the converted pattern.
    """
    posix_classes = {
        's': '[:space:]',
        'd': '[:digit:]',
        'w': '[:alnum:]',
    }
    pieces = []
    in_brackets = False
    # index of the first member of the bracket expression currently open
    first_member = 0
    position = 0
    length = len(pattern)
    while position < length:
        char = pattern[position]
        if char == '\\' and position + 1 < length:
            # consume the backslash together with the character it escapes
            escaped = pattern[position + 1]
            if escaped in posix_classes:
                if in_brackets:
                    pieces.append(posix_classes[escaped])
                else:
                    pieces.append('[' + posix_classes[escaped] + ']')
            else:
                pieces.append(char + escaped)
            position += 2
            continue
        if in_brackets:
            # a ']' in first position is a literal member, not the end
            if char == ']' and position > first_member:
                in_brackets = False
        elif char == '[':
            in_brackets = True
            first_member = position + 1
            if pattern[first_member:first_member + 1] == '^':
                first_member += 1
        if char == "'":
            pieces.append("\\'")
        else:
            pieces.append(char)
        position += 1
    return ''.join(pieces)
def main():
    """
    Command-line entry point of the replace bot.

    Reads the command-line arguments (via wikipedia.handleArgs()), collects
    the replacements from the command line, from interactive input or from a
    predefined entry of the 'fixes' dictionary, compiles them to regular
    expressions, chooses a page generator and runs a ReplaceRobot on the
    pages it yields.

    If no page generator could be determined, the module's __doc__ is
    printed and the program exits.
    """
    gen = None
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = []
    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used when
    # the -xml parameter is given.
    xmlFilename = None
    useSql = False
    # the textfile's path, either absolute or relative, which will be used
    # when the -file parameter is given.
    textfilename = None
    # the category name which will be used when the -cat parameter is given.
    categoryname = None
    # pages which will be processed when the -page parameter is used
    PageTitles = []
    # a page whose referrers will be processed when the -ref parameter is used
    referredPageTitle = None
    # a page whose links will be processed when the -links parameter is used
    linkingPageTitle = None
    # will become True when the user presses a ('yes to all') or uses the
    # -always commandline parameter.
    acceptall = False
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    # Google query
    googleQuery = None

    # Load default summary message.
    wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))

    # Read commandline parameters.
    for arg in wikipedia.handleArgs():
        if arg == '-regex':
            regex = True
        elif arg.startswith('-file'):
            if len(arg) >= 6:
                textfilename = arg[6:]
            gen = pagegenerators.TextfilePageGenerator(textfilename)
        elif arg.startswith('-cat'):
            if len(arg) == 4:
                categoryname = wikipedia.input(u'Please enter the category name:')
            else:
                categoryname = arg[5:]
            cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % categoryname)
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg == '-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(wikipedia.input(u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-ref'):
            if len(arg) == 4:
                referredPageTitle = wikipedia.input(u'Links to which page should be processed?')
            else:
                referredPageTitle = arg[5:]
            referredPage = wikipedia.Page(wikipedia.getSite(), referredPageTitle)
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-links'):
            if len(arg) == 6:
                linkingPageTitle = wikipedia.input(u'Links from which page should be processed?')
            else:
                linkingPageTitle = arg[7:]
            linkingPage = wikipedia.Page(wikipedia.getSite(), linkingPageTitle)
            gen = pagegenerators.LinkedPageGenerator(linkingPage)
        elif arg.startswith('-start'):
            if len(arg) == 6:
                firstPageTitle = wikipedia.input(u'Which page do you want to change?')
            else:
                firstPageTitle = arg[7:]
            namespace = wikipedia.Page(wikipedia.getSite(), firstPageTitle).namespace()
            gen = pagegenerators.AllpagesPageGenerator(firstPageTitle, namespace)
        elif arg.startswith('-google'):
            if len(arg) >= 8:
                googleQuery = arg[8:]
            gen = pagegenerators.GoogleSearchPageGenerator(googleQuery)
        elif arg.startswith('-except:'):
            exceptions.append(arg[8:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg == '-always':
            acceptall = True
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        else:
            # anything else is taken as original / replacement text
            commandline_replacements.append(arg)

    if len(commandline_replacements) == 2 and fix is None:
        old, new = commandline_replacements
        replacements.append((old, new))
        # The whole description is formatted into the message. The previous
        # code formatted only ' (-' and appended the rest, which gave the
        # same text only because every message ends in '%s'.
        change = ' (-' + old + ' +' + new + ')'
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg) % change)
    elif fix is None:
        # Ask the user for one or more replacements.
        old = wikipedia.input(u'Please enter the text that should be replaced:')
        new = wikipedia.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = wikipedia.input(u'Please enter another text that should be replaced, or press Enter to start:')
            if old == '':
                change = change + ')'
                break
            new = wikipedia.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        default_summary_message = wikipedia.translate(wikipedia.getSite(), msg) % change
        wikipedia.output(u'The summary message will default to: %s' % default_summary_message)
        summary_message = wikipedia.input(u'Press Enter to use this default message, or enter a description of the changes your bot will make:')
        if summary_message == '':
            summary_message = default_summary_message
        wikipedia.setAction(summary_message)
    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes[fix]
        except KeyError:
            wikipedia.output(u'Available predefined fixes are: %s' % fixes.keys())
            wikipedia.stopme()
            sys.exit()
        if 'regex' in fix:
            regex = fix['regex']
        if 'msg' in fix:
            wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), fix['msg']))
        if 'exceptions' in fix:
            exceptions = fix['exceptions']
        replacements = fix['replacements']

    # Already compile all regular expressions here to save time later.
    # New lists are built instead of assigning into the old ones, because
    # for a predefined fix those are the lists stored in the module-level
    # 'fixes' dictionary and must keep their plain-text patterns.
    compiled_replacements = []
    for old, new in replacements:
        if not regex:
            old = re.escape(old)
        compiled_replacements.append((re.compile(old, re.UNICODE), new))
    replacements = compiled_replacements

    compiled_exceptions = []
    for exception in exceptions:
        if not regex:
            exception = re.escape(exception)
        compiled_exceptions.append(re.compile(exception, re.UNICODE))
    exceptions = compiled_exceptions

    # -xml, -sql and -page take precedence (in this order) over a generator
    # chosen while reading the arguments.
    if xmlFilename:
        gen = XmlDumpReplacePageGenerator(xmlFilename, replacements, exceptions)
    elif useSql:
        # NOTE(review): the patterns are pasted into the SQL text; the only
        # protection is the escaping done by prepareRegexForMySQL.
        whereClause = 'WHERE (%s)' % ' OR '.join(["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern) for (old, new) in replacements])
        if exceptions:
            exceptClause = 'AND NOT (%s)' % ' OR '.join(["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern) for exc in exceptions])
        else:
            exceptClause = ''
        query = (u"\n"
                 u"SELECT page_namespace, page_title\n"
                 u"FROM page\n"
                 u"JOIN text ON (page_id = old_id)\n"
                 u"%s\n"
                 u"%s\n"
                 u"LIMIT 200") % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)
    elif PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
        gen = iter(pages)

    if not gen:
        # syntax error, show help text from the top of this file
        # NOTE(review): this relies on a module docstring being present -
        # confirm that the file has one.
        wikipedia.output(__doc__, 'utf-8')
        wikipedia.stopme()
        sys.exit()

    if namespaces:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall)
    bot.run()
# Script entry point: nothing below runs when the module is imported.
if __name__ == "__main__":
    try:
        main()
    finally:
        # stopme() is called whether main() returned, raised or exited
        wikipedia.stopme()