# From Wikipedia, the free encyclopedia
# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
def ubrkoment(text):
    """Return ``text`` with nowiki sections and HTML comments removed.

    Everything from ``<nowiki>`` up to the nearest ``</nowiki>`` and from
    ``<!--`` up to the nearest ``-->`` is cut out.  Tags are matched
    case-insensitively and a section may span several lines.

    text -- the wiki text to clean.
    """
    # The closing tag must be part of the pattern: a lazy ``.*?`` with
    # nothing after it matches the empty string, which would strip only the
    # opening tag and leave the nowiki content in place.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    # Remove one match at a time and rescan from the start, so that a section
    # which only becomes complete after an earlier removal is caught as well.
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
def korr_im(jaz, t, pref):
    """Normalise the title ``t``; return u'' if it is in an unwanted namespace.

    jaz  -- language code handed to ``mysite.family`` for namespace lookup.
    t    -- the title to normalise.
    pref -- iterable of prefixes; the first one ``t`` starts with is removed.

    After prefix removal the title is passed through ``perv_upper``
    (presumably it upper-cases the first letter -- confirm in zbib_tekst).
    A title without a colon is returned as is.  A title with a colon is kept
    only when it starts with ``Category:``, ``Image:`` or the category/image
    namespace name of language ``jaz``; otherwise u'' is returned.
    """
    for prefix in pref:
        if t.startswith(prefix):
            t = t[len(prefix):]
            break
    t = perv_upper(t)

    if u':' not in t:
        return t
    if t.startswith(u'Category:') or t.startswith(u'Image:'):
        return t
    # The localized namespace names are looked up only when the generic
    # English ones did not match.
    if t.startswith(mysite.family.category_namespace(jaz) + u':'):
        return t
    if t.startswith(mysite.family.image_namespace(jaz) + u':'):
        return t
    return u''
# Category-link prefixes that main() searches for in the page text.
# Variants starting with a lower-case "[[kategorio:" are not listed because
# main() rewrites "[[k" to "[[K" before matching.
iskkat = [
    u'[[Kategorio:Naskiĝ',
    u'[[Kategorio:naskiĝ',
    u'[[Kategorio:Mort',
    u'[[Kategorio:mort',
]
def _read_prefixes(fipref):
    """Return the lines of the UTF-8 file ``fipref`` without BOM and line ends."""
    prefixes = []
    fpref = codecs.open(fipref, 'rb', encoding='utf-8')
    try:
        for s in fpref.readlines():
            # Drop a byte order mark (U+FEFF) at the start of the line.
            if s.startswith(u'\ufeff'):
                s = s[1:]
            s = s.replace(u'\r', u'')
            # endswith() is safe for a line that is empty after the
            # carriage returns have been removed; indexing is not.
            if s.endswith(u'\n'):
                s = s[:-1]
            prefixes.append(s)
    finally:
        fpref.close()
    return prefixes


def main(vhjaz, fvh, fvih, fipref):
    """Write the titles of dump pages that contain an ``iskkat`` prefix.

    vhjaz  -- language code passed to ``wikipedia.getSite``.
    fvh    -- path of the XML dump, passed to ``xmlreader.XmlDump``.
    fvih   -- path of the UTF-8 output file; one title per line.
    fipref -- path of a UTF-8 prefix file, or u'' for none.

    A page is skipped when its text matches the site's redirect regex or
    when its title contains a colon.  For the remaining pages the text is
    normalised ("[[ " -> "[[", ": " -> ":", "[[k" -> "[[K") and the title
    is written if any entry of ``iskkat`` occurs in the normalised text.
    """
    if fipref != u'':
        # The prefixes are not used further in this function; the file is
        # still read so that a missing or unreadable file fails as before.
        _read_prefixes(fipref)

    f1 = codecs.open(fvih, 'w', 'utf-8')
    try:
        insite = wikipedia.getSite(vhjaz, fam=u'wikipedia')
        # open xml dump and read page titles out of it
        dump = xmlreader.XmlDump(fvh)
        redirR = insite.redirectRegex()
        readPagesCount = 0
        for entry in dump.parse():
            readPagesCount += 1
            # always print status message after 1000 pages
            if readPagesCount % 1000 == 0:
                print('%i pages read...' % readPagesCount)
            if redirR.search(entry.text) or u':' in entry.title:
                continue
            t = entry.text.replace(u'[[ ', u'[[').replace(u': ', u':')
            t = t.replace(u'[[k', u'[[K')
            for isk in iskkat:
                if isk in t:
                    f1.write(u'%s\n' % entry.title)
                    # Flush per title so the file is current even if the
                    # run is interrupted.
                    f1.flush()
                    break
    finally:
        # Close the output file on success and on error alike.
        f1.close()
# Script entry: <dump file> <output file> are taken from the command line;
# the language is fixed to 'eo' and no prefix file is used.
vhjaz = u'eo'
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
fpref = u''
# Module-level site object; korr_im() reads it as a global.
mysite = wikipedia.getSite()
try:
    main(vhjaz, fvh, fvih, fpref)
finally:
    # stopme() is called whether main() returns or raises; an exception
    # still propagates afterwards.
    wikipedia.stopme()
# </nowiki>  (stray wiki-markup closing tag; not Python code)