# -*- coding: UTF-8 -*-
__version__='$Id:'
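# Statistics over a MediaWiki XML dump: every page is classified as a
# redirect ('ali'), category ('kat'), other-namespace page ('akc'),
# disambiguation ('apa') or article ('art'), and for each class the script
# counts how many pages carry categories and/or interwiki links.  Written
# for the old pywikipedia framework; identifiers are transliterated Russian
# (vivod = output, otch = report, dobav = add, pech = print).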
import wikipedia
import re, sys, math
import codecs
import xmlreader
from zbib_tekst import *
def vivod(b):
    # write a line both to the console and to the main report file
    wikipedia.output(b)
    otch.write(b)
    otch.flush()

def vivod2(b):
    # write a line both to the console and to the secondary report file
    wikipedia.output(b)
    otch2.write(b)
    otch2.flush()
def ubrkoment(text):
    # Ignore text within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
def tkateg(t):
    # collect the category links from the page text; return the titles of
    # those containing no digits, together with the total category count
    sk = wikipedia.getCategoryLinks(t, mysite)
    r = []
    for k in sk:
        n = k.title()
        dop = 1
        for c in u'0123456789':
            if c in n:
                dop = 0
                break
        if dop:
            r.append(n)
    return (r, len(sk))
def n_ivi(t):
    # count interwiki links that point to a site other than mysite;
    # a throwaway Page object is used only to parse the text
    pl = wikipedia.Page(mysite, u'a')
    pl._contents = t
    oiwi = pl.interwiki()
    n = 0
    for pl2 in oiwi:
        if pl2.site() != mysite:
            n += 1
    return n
def tipsta(n, t):
    # classify a page by its title n and text t:
    # 'ali' = redirect, 'kat' = category, 'akc' = other namespace,
    # 'apa' = disambiguation ({{apartigilo}}), 'art' = ordinary article
    redirR = mysite.redirectRegex()
    m = redirR.match(t)
    if m:
        return u'ali'
    katprefi = mysite.family.category_namespace(mysite.lang)
    if u':' in n:
        if n.startswith(katprefi + u':'):
            return u'kat'
        else:
            return u'akc'
    if (u'{{apartigilo}}' in t) or (u'{{Apartigilo}}' in t):
        return u'apa'
    return u'art'
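# Illustrative examples (page titles hypothetical, assuming the Esperanto
# category prefix u'Kategorio:'): u'Kategorio:Urboj' would be classified as
# 'kat', u'Vikipedio:Helpo' as 'akc', a plain u'Berlino' as 'art', or as
# 'apa' if its text embeds {{apartigilo}}.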
class Stat:
    def __init__(self):
        self.o = 0             # total pages
        self.nk = 0            # pages without categories
        self.pk = 0            # pages with categories
        self.ni = 0            # pages without interwiki links
        self.pi = 0            # pages with interwiki links
        self.t = [0, 0, 0, 0]  # combined counts, index = kat*2 + ivi

    def dobav(self, kat, ivi):
        # register one page; kat and ivi are its category and interwiki
        # counts, reduced here to 0/1 flags
        if kat != 0:
            kat = 1
        if ivi != 0:
            ivi = 1
        self.o += 1
        if kat:
            self.pk += 1
        else:
            self.nk += 1
        if ivi:
            self.pi += 1
        else:
            self.ni += 1
        i = kat * 2 + ivi
        self.t[i] += 1

    def pech(self):
        # format the counters as a printable table; nkni..pkpi are the four
        # combinations of (no/has categories) x (no/has interwiki)
        b = (u'o    %7d\n' +
             u'nk   %7d\n' +
             u'pk   %7d\n' +
             u'ni   %7d\n' +
             u'pi   %7d\n' +
             u'nkni %7d\n' +
             u'nkpi %7d\n' +
             u'pkni %7d\n' +
             u'pkpi %7d\n') % (self.o, self.nk, self.pk, self.ni, self.pi,
                               self.t[0], self.t[1], self.t[2], self.t[3])
        return b
def main(jaz, fvh):
    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    readPagesCount = 0
    maxpages = 300000000
    nob = 0
    stat = {}
    stat[u'akc'] = Stat()
    stat[u'ali'] = Stat()
    stat[u'apa'] = Stat()
    stat[u'art'] = Stat()
    stat[u'kat'] = Stat()
    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 1000 pages
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        if readPagesCount > maxpages:
            break
        n = entry.title
        t = entry.text
        tip = tipsta(n, t)
        nob += 1
        (tkat, nkat) = tkateg(t)
        nivi = n_ivi(t)
        if tip == u'ali' and (nkat != 0 or nivi != 0):
            # report redirects that still carry categories or interwiki links
            vivod2(u'%s\n%s\n========================\n'
                   % (entry.title, entry.text))
        stat[tip].dobav(nkat, nivi)
    vivod(u'\n\n\n\n\nnob=%d\n\n\n\n' % nob)
    for tip in [u'akc', u'ali', u'apa', u'art', u'kat']:
        vivod(u' %s\n%s\n' % (tip, stat[tip].pech()))
jaz = u'eo'
#jaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
# strip a trailing '.txt' so both report names can be derived from one base
if fvih.endswith(u'.txt'):
    fvih = fvih[:-4]
fotch = fvih + u'.txt'
fotch2 = fvih + u'_op.txt'
otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite(jaz)
try:
    main(jaz, fvh)
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()
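# A minimal invocation sketch, assuming the pywikipedia framework is set up
# and the script is saved as slov_iz_xml.py (the dump file name below is
# hypothetical):
#
#   python slov_iz_xml.py eowiki-pages-articles.xml report
#
# sys.argv[1] is the XML dump to read; sys.argv[2] is the output base name,
# so the summary table goes to report.txt and the list of redirects that
# still carry categories or interwiki links goes to report_op.txt.  The
# wiki language is hard-coded above as u'eo' (Esperanto).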