# Материал из Википедии — свободной энциклопедии
# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys, math
import codecs
import xmlreader
import zperevod as perevod
from zbib_tekst import *
def vivod(b):
    # Echo a message both to the console and to the report file.
    # `otch` is a module-level report file handle opened by the script tail
    # before main() runs.
    wikipedia.output(b)
    otch.write(b)
    otch.flush()  # flush immediately so the report survives a crash
def ubrkoment (text):
    # Remove <nowiki>...</nowiki> sections and HTML comments from wiki text.
    #
    # BUG FIX: the original pattern's first alternative was '<nowiki>.*?'
    # (no closing tag); a trailing lazy '.*?' matches the empty string, so
    # only the literal opening tag was deleted and the nowiki body stayed.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    # Delete matches one at a time and re-search, so removals that expose
    # new matches (e.g. a comment split by an inner comment) are also
    # cleaned up — a plain re.sub would miss those.
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
def zapiszam(f1, ti, fperv, ts, tn):
    # Append one replacement record to the output stream f1.
    #
    # f1    -- writable text stream
    # ti    -- category title, written in a header when fperv is truthy
    # fperv -- truthy on the first record for this category
    # ts    -- old text (a trailing newline is appended when non-empty)
    # tn    -- new text
    # Always returns 0.
    if fperv:
        f1.write(u'|+ kategorio\n%s\n' % ti)
        f1.flush()
    old = ts if ts == u'' else ts + u'\n'
    record = u'?=======\n1ksu\n-=======\n%s+=======\n%s\n' % (old, tn)
    f1.write(record)
    f1.flush()
    return 0
#def vich_cslov(t,tsl,min,max):
def cslov(t):
    # Split wiki text t into a list of normalized word stems.
    #
    # Strips language/category links and comments, replaces punctuation
    # with spaces, lowercases, and trims endings that look like Esperanto
    # inflection (-n accusative, -j plural, adjectival -a -> -o).
    # Tokens shorter than 3 chars or containing digits are dropped.
    #
    # BUG FIX: the original ended with `return a`, returning only the last
    # token as a string; every caller treats the result as a list of words.
    t = wikipedia.removeLanguageLinks(t)
    t = wikipedia.removeCategoryLinks(t, mysite)
    t = ubrkoment (t)
    # punctuation and whitespace become token separators
    for c in u'`~!@#$%^&*()_+|-=\\{};\':",./<>?\n\t\r':
        t = t.replace(c, u' ')
    # link brackets are deleted outright so [[word]] keeps its word intact
    for c in u'[]':
        t = t.replace(c, u'')
    r = []
    for a in t.split(u' '):
        if len(a) < 3:
            continue
        # skip tokens containing any digit
        if any(c in a for c in u'0123456789'):
            continue
        a = a.lower()
        # strip inflectional endings (presumably Esperanto -n/-j/-a; the
        # -a -> -o step maps adjectives onto the noun stem)
        if a.endswith(u'n'):
            a = a[:-1]
        if a.endswith(u'j'):
            a = a[:-1]
        if a.endswith(u'a'):
            a = a[:-1] + u'o'
        if len(a) < 3:
            continue
        r.append(a)
    return r
def tkateg(t):
    # Extract category titles from wiki text t.
    #
    # Returns (titles, total): `titles` lists category titles that contain
    # no digits; `total` is the full count of category links found, so the
    # caller can tell "no categories at all" from "only digit categories".
    sk = wikipedia.getCategoryLinks(t, mysite)
    r = [k.title() for k in sk
         if not any(c in k.title() for c in u'0123456789')]
    return (r, len(sk))
def provsta(n, t):
    # Decide whether a page is a plain article worth processing.
    #
    # n -- page title, t -- page text.  Returns 0 for redirects, titles
    # containing a colon (taken as a non-article namespace), and pages
    # carrying the {{apartigilo}} disambiguation template; 1 otherwise.
    if mysite.redirectRegex().search(t):
        return 0
    if u':' in n:
        return 0
    if (u'{{apartigilo}}' in t) or (u'{{Apartigilo}}' in t):
        return 0
    return 1
def spisvstr(s, ra):
    # Join the strings in sequence s with separator ra.
    #
    # Idiom fix: the original built the result by repeated `+=`
    # concatenation with a first-element flag — that is exactly
    # str.join, which is also linear instead of quadratic.
    return ra.join(s)
def main(jaz, fvh, fvih, fipref):
    # Suggest categories for uncategorized wiki articles.
    #
    # jaz    -- language code (unused here beyond the signature)
    # fvh    -- path to the XML dump to scan
    # fvih   -- path of the output file (opened, currently unused)
    # fipref -- path of a UTF-8 file listing known category names
    #
    # Pass 1 over the dump records the categories of categorized articles;
    # pass 2 proposes categories for articles that have none, by matching
    # normalized word n-grams of the title against (a) the category-name
    # table and (b) titles of already-categorized articles.  All proposals
    # are reported through vivod().
    tblkat = {}    # known category title -> 1
    tblkatc = {}   # normalized stem sequence -> original category title
    fpref = codecs.open(fipref, 'rb', encoding='utf-8')
    for s in fpref.readlines():
        # strip a UTF-8 BOM decoded as U+FEFF on the first line
        if s and ord(s[0]) == 65279:
            s = s[1:]
        s = ubr_nk_prob(s)
        tblkat[s] = 1
        s1 = spisvstr(cslov(s), u' ')
        tblkatc[s1] = s
    fpref.close()
    f1 = codecs.open(fvih, 'w', 'utf-8')
    dump = xmlreader.XmlDump(fvh)
    maxpages = 300000
    tkatuz = {}    # lowercased article title -> its digit-free categories
    nobst = 0      # articles examined
    nnekst = 0     # articles with no categories at all
    readPagesCount = 0
    # --- pass 1: collect categories of existing articles ---
    for entry in dump.parse():
        readPagesCount += 1
        if readPagesCount % 1000 == 0:
            print('%i pages read...' % readPagesCount)
        if readPagesCount > maxpages:
            break
        n = entry.title
        t = entry.text
        if not provsta(n, t):
            continue
        nobst += 1
        (tkat, nkat) = tkateg(t)
        if len(tkat) > 0:
            tkatuz[n.lower()] = tkat
        if nkat == 0:
            nnekst += 1
    vivod(u'\n\n\nnobst=%d nnekst=%d\n\n\n\n' % (nobst, nnekst))
    # --- pass 2: propose categories for uncategorized articles ---
    # NOTE(review): reuses the same XmlDump object; assumes parse()
    # restarts from the beginning of the file — confirm with xmlreader.
    readPagesCount = 0
    for entry in dump.parse():
        readPagesCount += 1
        if readPagesCount % 1000 == 0:
            print('%i pages read...' % readPagesCount)
        if readPagesCount > maxpages:
            break
        n = entry.title
        t = entry.text
        if not provsta(n, t):
            continue
        (tkat, nkat) = tkateg(t)
        if nkat != 0:
            continue
        tts = cslov(n)
        # match title stem n-grams (longest first) against category names
        m = 3
        while m > 0:
            i = len(tts) - m
            while i >= 0:
                # BUG FIX: original called cslov() on the list slice, which
                # would crash — the stems are already normalized, join them.
                ts = spisvstr(tts[i:i + m], u' ')
                vivod(u' prob %s\n' % ts)
                if ts in tblkatc:
                    vivod(u'+ %s %s %s\n' % (n, ts, tblkatc[ts]))
                i -= 1
            # BUG FIX: original did `m += 1` here, an infinite loop
            m -= 1
        # match stem n-grams against titles of categorized articles
        for i in range(len(tts)):
            if i < len(tts) - 2:
                ts = tts[i] + u' ' + tts[i + 1] + u' ' + tts[i + 2]
                if ts in tkatuz:
                    # BUG FIX: original passed 4 values to a 3-slot format
                    # and called spisvstr with a single argument
                    vivod(u'+ %s %s %s\n' % (n, ts, spisvstr(tkatuz[ts], u', ')))
                    break
            if i < len(tts) - 1:
                ts = tts[i] + u' ' + tts[i + 1]
                if ts in tkatuz:
                    vivod(u'+ %s %s %s\n' % (n, ts, spisvstr(tkatuz[ts], u', ')))
                    break
            ts = tts[i]
            if ts in tkatuz:
                vivod(u'+ %s %s %s\n' % (n, ts, spisvstr(tkatuz[ts], u', ')))
                break
    # close the (currently unused) output file instead of leaking it
    f1.close()
# --- script entry: fixed language, args, report files, run main() ---
jaz = u'eo'
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')    # XML dump path
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')   # output base name
if fvih.endswith(u'.txt'):
    fvih = fvih[:-4]
fotch = fvih + u'_ot.txt'    # main report file
fotch2 = fvih + u'_op.txt'   # secondary report file
fvih += u'.txt'
fpref = u'eo_ka_sp.txt'      # fixed list of known category names
otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite(jaz)
try:
    main(jaz, fvh, fvih, fpref)
finally:
    # always release the framework (original did stopme in both the
    # exception path and the success path)
    wikipedia.stopme()
# (stray "</nowiki>" scrape artifact commented out)