# Material from Wikipedia, the free encyclopedia (stray extraction header; commented out so the file parses)
# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
def vivod(b):
    """Emit message *b* both to the wiki console and to the report file.

    Writes through the module-global report handle ``otch`` and flushes
    immediately so progress is visible while the dump scan runs.
    """
    wikipedia.output(b)
    otch.write(b)
    otch.flush()
def ubrkoment (text):
    """Return *text* with <nowiki>...</nowiki> elements and HTML comments removed.

    Bug fix: the original pattern was ``<nowiki>.*?|<!--.*?-->``.  A trailing
    non-greedy ``.*?`` with nothing after it always matches zero characters,
    so the old code stripped only the literal ``<nowiki>`` opening tag and
    left the protected content and ``</nowiki>`` in place — contrary to the
    stated intent of ignoring text inside nowiki tags.  The alternation now
    consumes the whole element.  DOTALL lets comments/elements span lines.
    """
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    # A single global substitution is equivalent to the original
    # remove-and-rescan loop for these non-overlapping spans.
    return nowikiOrHtmlCommentR.sub(u'', text)
def xml_jaz(fvh):
    """Read the head of XML dump *fvh* and return its xml:lang attribute value.

    Only the first 1000 bytes are inspected.  Returns u'' (after printing a
    diagnostic) when the attribute cannot be located.
    """
    with codecs.open(fvh, 'rb') as dump_head:
        head = dump_head.read(1000)
    marker = 'xml:lang="'
    start = iskats(head, 0, marker)
    if start == -1:
        print('!!! net \'xml:lang=""\'')
        return u''
    start += len(marker)
    end = iskats(head, start, '"')
    if end == -1:
        print('!!! net \'xml:lang=""\'')
        return u''
    return head[start:end]
def korr_im (jaz,t,pref):
    """Normalise page title *t*: strip the first matching prefix from *pref*,
    capitalise the first letter, and reject titles in namespaces other than
    Category/Image (either the English or the *jaz*-local spelling).

    Returns the corrected title, or u'' for a disallowed namespace.
    NOTE(review): reads the module-global ``mysite``, which is not defined in
    this file — confirm it is provided elsewhere before calling.
    """
    for known_prefix in pref:
        if t.startswith(known_prefix):
            t = t[len(known_prefix):]
            break
    t = perv_upper(t)
    if u':' not in t:
        return t
    # Accept the English namespace spellings outright.
    if t.startswith(u'Category:') or t.startswith(u'Image:'):
        return t
    # Otherwise accept only the wiki's localized category/image namespaces.
    if t.startswith(mysite.family.category_namespace(jaz) + u':'):
        return t
    if t.startswith(mysite.family.image_namespace(jaz) + u':'):
        return t
    return u''
# Esperanto category-link openings searched for in page text:
# "Naskiĝ..." = "born ...", "Mort..." = "died ...", each in both
# first-letter capitalizations of the name part.
# NOTE(review): this constant is not referenced anywhere in this file.
iskkat=[
u'[[Kategorio:Naskiĝ',
u'[[Kategorio:naskiĝ',
u'[[Kategorio:Mort',
u'[[Kategorio:mort',
]
def main(vhjaz,fvh,fvih,fipref):
    """Scan an XML dump for pages whose text contains any configured
    category-link prefix, and write the matching titles to a report file.

    vhjaz  -- language code of the input wiki (taken from the dump header)
    fvh    -- path of the XML dump file
    fvih   -- path of the output report: for each prefix a '# <prefix>'
              header line followed by one matching page title per line
    fipref -- path of a UTF-8 text file listing one category prefix
              ('Namespace:Name') per line
    """
    # NOTE(review): tzfl/tzst/tzno are initialised but never used below.
    tzfl=0
    tzst={}
    tzno={}
    pref=[]
    if 1:
        # Read the prefix list; 65279 is U+FEFF, a leading BOM to strip.
        fpref=codecs.open(fipref,'rb',encoding='utf-8')
        for s in fpref.readlines():
            if ord(s[0]) == 65279:
                s=s[1:]
            # ubr_nk_prob: helper from zbib_tekst — presumably trims
            # trailing newline/whitespace; TODO confirm.
            s=ubr_nk_prob(s)
            if len(s)>0:
                pref.append(perv_upper(s))
        fpref.close()
    # For each prefix 'Ns:Name' build four search variants:
    # [Ns:Name, Category:Name, Ns:name, Category:name].
    # spk holds the variant lists; spr is the parallel list that collects
    # the titles of pages matching each prefix.
    spk=[]
    spr=[]
    for k in pref:
        k=perv_upper(k)
        i=iskat(k,u':')
        if i==-1:
            # A prefix without ':' cannot name a namespace — report and skip.
            vivod(u'!!! %s\n'%k)
            continue
        k0=k[:i+1]+perv_upper(k[i+1:])
        k1=u'Category'+k0[i:]
        k2=k[:i+1]+perv_lower(k[i+1:])
        k3=u'Category'+k2[i:]
        spk.append([k0,k1,k2,k3])
        spr.append([])
    f1=codecs.open(fvih, 'w', 'utf-8')
    insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')
    katprefi=insite.family.category_namespace(vhjaz)
    # First character of the localized category namespace, used below to
    # upper-case lower-cased '[[<ns>' link openings.
    kp0=katprefi[0]
    # Open the xml dump and iterate over its pages.
    dump = xmlreader.XmlDump(fvh)
    redirR = insite.redirectRegex()
    readPagesCount = 0
    for entry in dump.parse():
        readPagesCount += 1
        # Print a status message every 1000 pages.
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        m = redirR.search(entry.text)
        if (not m):
            # Not a redirect: normalise the wikitext so the plain substring
            # tests against the spk variants below can match.
            t=entry.text
            t=ubrkoment(t)
            t=t.replace(u'[[ ',u'[[').replace(u': ',u':')
            t=t.replace(u'_',u' ')
            t=t.replace(u'[[c',u'[[C')
            t=t.replace(u'[['+kp0.lower(),u'[['+kp0.upper())
            i=0
            while i<len(spk):
                dop=0
                for isk in spk[i]:
                    if isk in t:
                        dop=1
                        break
                if dop:
                    spr[i].append(entry.title)
                i+=1
    # Write the report: one '# variant' header per prefix, then the titles.
    i=0
    while i<len(spk):
        b=u'# %s\n' % spk[i][0]
        f1.write(b)
        f1.flush()
        for isk in spr[i]:
            b=u'%s\n' % isk
            f1.write(b)
            f1.flush()
        i+=1
# Script entry point: argv[1] is the XML dump, argv[2] the prefix list.
# Output file names are derived from the prefix file's base name.
fvh = sys.argv[1]
fpref = sys.argv[2]
base = fpref
if base.endswith(u'.txt'):
    base = base[:-4]
fvih = base + u'_re.txt'    # report of matching titles
fotch = base + u'_ot.txt'   # diagnostics written via vivod()
vhjaz = xml_jaz(fvh)
if vhjaz != u'':
    # 'otch' stays global: vivod() writes through it.
    otch = codecs.open(fotch, 'w', 'utf-8')
    main(vhjaz, fvh, fvih, fpref)
# (stray </nowiki> tag from wiki-page extraction; commented out so the file parses)