# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys, math
import codecs
import xmlreader
import perevod
from zbib_tekst import *
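
# Summary: read a wiki XML dump (the language is hard-coded to 'eo' below;
# sys.argv[1] is the dump file, sys.argv[2] the base name of the output and
# report files), build per-category statistics of relative word frequencies
# from the categorized articles, and for every article without categories
# report the two categories whose word-frequency profiles are closest.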
def vivod(b):
    wikipedia.output(b)
    otch.write(b)
    otch.flush()
def ubrkoment(text):
    # Ignore text within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
iskdl=200
def zapiszam(f1,ti,fperv,ts,tn):
    if fperv:
        b=u'|+ kategorio\n%s\n'%ti
        f1.write(b)
        f1.flush()
    if ts!=u'':
        ts+=u'\n'
    b=((u'?=======\n1ksu\n-=======\n%s+=======\n%s\n')%(ts,tn))
    f1.write(b)
    f1.flush()
    return 0
# mindl_dobavsl (minimum word length accepted by dobavsl) is never defined in
# this script; the value below is an assumed default, not the original one.
mindl_dobavsl=4

def dobavsl(tblsl,s):
    s=s.replace(u"'",u' ').replace(u".",u' ').replace(u"-",u' ')
    st=s.split(u' ')
    for t in st:
        if (u'(' in t) or (u')' in t):
            break
        if len(t)>=mindl_dobavsl:
            tblsl[t]=1
def vich_cslov(t):
    t = wikipedia.removeLanguageLinks(t)
    t = wikipedia.removeCategoryLinks(t, mysite)
    t = ubrkoment(t)
    # replace punctuation with spaces and split the text into words
    for c in u'`~!@#$%^&*()_+|-=\\[]{};\':",./<>?\n\t\r':
        t=t.replace(c,u' ')
    s=t.split(u' ')
    tb={}
    n=1
    for a in s:
        if len(a)<4:
            continue
        a=perv_upper(a.lower())
        # crude Esperanto normalization: drop the accusative ending -n and the
        # plural ending -j, and turn the adjective ending -a into the noun ending -o
        if a.endswith(u'n'):
            a=a[:len(a)-1]
        if a.endswith(u'j'):
            a=a[:len(a)-1]
        if a.endswith(u'a'):
            a=a[:len(a)-1]+u'o'
        if len(a)<4:
            continue
        if tb.has_key(a):
            tb[a]+=1
        else:
            tb[a]=1
        n+=1
    return (tb,n)
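
# Worked example for vich_cslov(), assuming perv_upper() from zbib_tekst
# capitalizes the first letter: u'grandajn' -> u'Grandajn' -> strip -n ->
# u'Grandaj' -> strip -j -> u'Granda' -> -a to -o -> u'Grando', so different
# inflected forms of one root are counted under a single key of the result.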
def tkateg(t):
    # pl=wikipedia.Page(mysite,u'a')
    # pl._contents=t
    sk=wikipedia.getCategoryLinks(t,mysite)
    r=[]
    for k in sk:
        n=k.title()
        # skip categories whose titles contain digits
        dop=1
        for c in u'0123456789':
            if c in n:
                dop=0
                break
        if dop:
            r.append(k.title())
    return (r,len(sk))
def main(jaz,fvh,fvih,fipref):
    tblkat={}
    fpref=codecs.open(fipref,'rb',encoding='utf-8')
    for s in fpref.readlines():
        # strip a leading BOM (U+FEFF)
        if ord(s[0]) == 65279:
            s=s[1:]
        s=ubr_nk_prob(s)
        tblkat[s]=1
    fpref.close()
    f1=codecs.open(fvih, 'w', 'utf-8')
    insite=wikipedia.getSite(jaz,fam = u'wikipedia')
    tblredir = {}
    # open xml dump and read page titles out of it
    redirR = insite.redirectRegex()
    dump = xmlreader.XmlDump(fvh)
    readPagesCount = 0
    sch_str=0
    sch_zam=0
    kcslt={}
    kcsln={}
    kcslnsl={}
    nekat=[]
    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 1000 pages
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        # if readPagesCount > 300:
        #     break
        m = redirR.search(entry.text)
        if m:
            # redirects carry no useful text; skip them
            pass
        else:
            # skip titles containing ':' (pages outside the main namespace)
            # if entry.title.startswith(u'Vikipedio:'):
            if u':' in entry.title:
                continue
            fperv=1
            t=entry.text
            (tkat,nkat)=tkateg(t)
            (cslt,csln)=vich_cslov(t)
            # add the page's relative word frequencies to the statistics of
            # each of its categories and of the pseudo-category u''
            for kat in tkat+[u'']:
                if not kcslt.has_key(kat):
                    kcslt[kat]={}
                    kcsln[kat]=0
                    kcslnsl[kat]=0
                kc=kcslt[kat]
                kcsln[kat]+=1
                kcslnsl[kat]+=csln
                for s, n in cslt.iteritems():
                    z=(n+0.0)/csln
                    (kcz,kcz2,kcn,kcnsl)=(0,0,0,0)
                    if kc.has_key(s):
                        (kcz,kcz2,kcn,kcnsl)=kc[s]
                    kcz+=z
                    kcz2+=z*z
                    kcn+=1
                    kcnsl+=n
                    kc[s]=(kcz,kcz2,kcn,kcnsl)
            if nkat==0:
                nekat.append((entry.title,cslt,csln))
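
    # Second pass: turn the per-category sums into per-word statistics.  With
    # z = n/csln the relative frequency of a word in one page, the loop below
    # computes for every word of every category (and of the pseudo-category u''):
    #     kcz  = (sum of z) / kcno                        -- mean frequency
    #     kcz2 = sqrt((sum of z*z)/kcno - kcz**2) + kcz/sqrt(kcnsl)
    # where kcno is the number of pages in the category and kcnsl the total
    # number of occurrences of the word in it.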
    for kat, kc in kcslt.iteritems():
        kcno=kcsln[kat]
        vivod(u'==== %s %d %d ====\n'%(kat,kcno,kcslnsl[kat]))
        for s, zz in kc.iteritems():
            (kcz,kcz2,kcn,kcnsl)=kc[s]
            if kat==u'':
                vivod(u' %s %f %f %d %d %d\n'%(s,kcz,kcz2,kcno,kcn,kcnsl))
            kcz/=kcno
            kcz2/=kcno
            kcz2=math.sqrt(kcz2-kcz*kcz)
            kcz2+=kcz/math.sqrt(kcnsl)
            if kat==u'':
                vivod(u' %s %f %f\n'%(s,kcz,kcz2))
            kc[s]=(kcz,kcz2)
    kc0=kcslt[u'']
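
    # Classify the uncategorized pages: for each one, compute the Euclidean
    # distance between its relative word frequencies and every category's
    # mean frequencies,
    #     metr = sqrt( sum over words (z - kcz)**2 ),
    # counting words absent from the category as kcz = 0 and words absent from
    # the page as z = 0, and report the two closest categories via vivod().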
    for (tit,cslt,csln) in nekat:
        # vivod(u'==tit== %s %d ====\n'%(tit,csln))
        metrl=100000000
        katl=u''
        metrl2=100000000
        katl2=u''
        for kat, kc in kcslt.iteritems():
            if kat==u'':
                continue
            kcno=kcsln[kat]
            kcnosl=kcslnsl[kat]
            # vivod(u' == %s %d %d ==\n'%(kat,kcno,kcnosl))
            metr=0
            for s, n in cslt.iteritems():
                z=(n+0.0)/csln
                # (kc0z,kc0z2)=kc0[s]
                if kc.has_key(s):
                    (kcz,kcz2)=kc[s]
                else:
                    kcz=0
                    # kcz2=1/math.sqrt(kcnosl)
                # f0=(z-kc0z)/kc0z2
                # f0=math.exp(-f0*f0)
                # f=(z-kcz)/kcz2
                # f=math.exp(-f*f)
                # vivod(u' %s %f %f %f %f %f %f %f\n'%(s,z,kc0z,kc0z2,f0,kcz,kcz2,f))
                metr+=(z-kcz)*(z-kcz)
            for s, (kcz,kcz2) in kc.iteritems():
                if not cslt.has_key(s):
                    metr+=kcz*kcz
            metr=math.sqrt(metr)
            # vivod(u' %s %f\n'%(kat,metr))
            if metr<metrl:
                metrl2=metrl
                katl2=katl
                metrl=metr
                katl=kat
            elif metr<metrl2:
                metrl2=metr
                katl2=kat
        vivod(u'%s + %s %f %s %f\n'%(tit,katl,metrl,katl2,metrl2))
        # break
    # if fperv==0:
    #     sch_str+=1
    #     f1.write(u'========================\n')
    #     f1.flush()
    #     otch2.write(u'%s + %s\n%s\n========================\n'%(entry.title,kat,entry.text))
    #     otch2.flush()
    # vivod(u'sch_str=%d sch_zam=%d\n'%(sch_str,sch_zam))
jaz = u'eo'
#jaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
if fvih.endswith(u'.txt'):
    fvih=fvih[:len(fvih)-4]
fotch = fvih+u'_ot.txt'
fotch2 = fvih+u'_op.txt'
fvih+=u'.txt'
#fprefim = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#fprefvse = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')
fpref = u'eo_ka_sp.txt'
#fotch = u'mm_otch.txt'
#mysite = wikipedia.getSite()
otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite(jaz)
try:
    main(jaz,fvh,fvih,fpref)
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()