# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
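# This script scans a wiki XML dump for [[wiki links]] whose target or piped
# label is written in ALL CAPS and writes proposed lower-cased replacements to
# an output file for later review. It is run as:
#   python <this script> <dump.xml> <output file>
# (the two paths are read from sys.argv at the bottom of the file).
# Note: zbib_tekst is the author's own helper module; judging from how the
# names are used here, ubr_nk_prob trims surrounding whitespace, perv_upper
# upper-cases the first letter, and iskats / iskats_mn are substring-search
# helpers returning positions (and, for iskats_mn, the index of the matched
# delimiter).
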
def vivod(b):
    # Write a message both to the console and to the report file.
    wikipedia.output(b)
    otch.write(b)
    otch.flush()
def ubrkoment (text):
    # Ignore text within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
def provrimcif(t):
    # Return 1 if t looks like a Roman numeral (only I V X L C D M, at most 10 characters).
    simv=[u'I',u'V',u'X',u'L',u'C',u'D',u'M']
    if len(t)>10:
        return 0
    for c in t:
        if (not c in simv):
            return 0
    return 1
mindl=6          # an all-caps word at least this long justifies a replacement on its own
mindl_sl=4       # shorter all-caps words (>= this) only count when a known word is also present
mindl_dobavsl=5  # minimum length for words added to the known-word table
prsimv=[u' ',u'.',u"'",u'-',u'(',u')']  # separator characters allowed inside link text
def provzam(tblsl,ss,flnesks,dopkor):
    # Build a lower-cased version of link text ss, or return u'' if no
    # replacement should be made. All-caps words are lower-cased (the first
    # letter is kept); Roman numerals are left alone and only reported.
    if ( len(ss)==0 or (u':' in ss) or
         (flnesks and (not u' ' in ubr_nk_prob(ss))) ):
        return u''
    # Reject link text containing anything but letters and separators.
    for j in range(len(ss)):
        c=ss[j]
        if ( (not c in prsimv) and (not c.isalpha()) ):
            return u''
    dop=dopkor
    dop2=0
    dopim=0
    im=u''
    sr=ss
    j=0
    while j<len(ss):
        # Skip separators, then take the next word ss[j:k].
        while j<len(ss) and (ss[j] in prsimv):
            j+=1
        if j>=len(ss):
            break
        k=j
        while k<len(ss) and (not ss[k] in prsimv):
            k+=1
        if tblsl.has_key(ss[j:k]):
#            vivod(u'tblsl %s\n'%ss[j:k])
            im=ss[j:k]
            dopim=1
        if ( k-j>=2 and ss[j+1:k]==ss[j+1:k].upper() ):
            if provrimcif(ss[j:k]):
                vivod(u'rimcif %s %s\n'%(ss[j:k],ss))
            else:
                # Lower-case everything after the first letter of the word.
                sr=sr[:j+1]+sr[j+1:k].lower()+sr[k:]
                if k-j>=mindl:
                    dop=1
#                if k-j>=mindl_sl and tblsl.has_key(ss[j:k]):
#                    vivod(u'tblsl %s\n'%ss[j:k])
#                    dop=1
                if k-j>=mindl_sl:
                    dop2=1
        j=k
    if dop==0 and dop2 and dopim:
        vivod(u'tblsl %s %s\n'%(im,ss))
        dop=1
    if dop==0:
        return u''
    if sr==ss:
        if not dopkor:
            vivod(u'!!! %s\n'%ss)
        return u''
    return sr
def zapiszam(f1,ti,fperv,ts,tn):
    # Write one replacement record (old text ts, new text tn) for page ti.
    # Returns 0 so the caller can clear its "first record for this page" flag.
    if fperv:
        b=u'|malmajuskligo\n%s\n'%ti
        f1.write(b)
        f1.flush()
    b=((u'?=======\n1\n-=======\n%s\n+=======\n%s\n')%(ts,tn))
    f1.write(b)
    f1.flush()
    return 0
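# The output file therefore contains, per affected page:
#   |malmajuskligo
#   <page title>
# followed by one block per replacement:
#   ?=======
#   1
#   -=======
#   <original wikitext>
#   +=======
#   <proposed wikitext>
# and a line of '=' characters after the page (written in main() below).
# Presumably this file is consumed by a separate replacement-applying script,
# which is not part of this source.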
def dobavsl(tblsl,s):
    # Split a title into words and add the sufficiently long ones to the
    # known-word table; stop at the first word containing parentheses.
    s=s.replace(u"'",u' ').replace(u".",u' ').replace(u"-",u' ')
    st=s.split(u' ')
    for t in st:
        if (u'(' in t) or (u')' in t):
            break
        if len(t)>=mindl_dobavsl:
            tblsl[t]=1
def main(vhjaz,fvh,fvih,fiprefim,fiprefvse,fiprefimali):
    tblsl={}
    tblim={}
    # Load article titles (one per line, skipping a possible BOM); each title
    # is marked in tblim and its words feed the known-word table tblsl.
    fpref=codecs.open(fiprefim,'rb',encoding='utf-8')
    for s in fpref.readlines():
        if ord(s[0]) == 65279:
            s=s[1:]
        s=ubr_nk_prob(s)
        tblim[s]=1
        dobavsl(tblsl,s)
    fpref.close()

    # Load link targets that must not be modified.
    tblvse={}
    fpref=codecs.open(fiprefvse,'rb',encoding='utf-8')
    for s in fpref.readlines():
        if ord(s[0]) == 65279:
            s=s[1:]
        s=ubr_nk_prob(s)
        tblvse[s]=1
    fpref.close()

    pref=[]
#    tblimali={}
    if 1:
        # Load additional titles from lines of the form '1|<title>|<value>'.
        fpref=codecs.open(fiprefimali,'rb',encoding='utf-8')
        for s in fpref.readlines():
            if ord(s[0]) == 65279:
                s=s[1:]
            s=ubr_nk_prob(s)
            st=s.split(u'|')
            if len(st)==3 and st[0]==u'1':
#                tblimali[st[1]]=st[2]
                tblim[st[1]]=st[2]
                dobavsl(tblsl,st[1])
        fpref.close()
    n=u''
#    f0=codecs.open(fvh,'rb',encoding='utf-8')
    f1=codecs.open(fvih, 'w', 'utf-8')

    insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')

    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    redirR = insite.redirectRegex()
    readPagesCount = 0
    sch_str=0   # pages with at least one proposed replacement
    sch_zam=0   # total proposed replacements
    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 1000 pages
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount

        m = redirR.search(entry.text)
        if m:
            # Skip redirects.
            pass
        else:
            if entry.title.startswith(u'Vikipedio:'):
                continue
            fperv=1
            t=entry.text
            p=0
            while p<len(t):
                # Find the next [[...]] link and split it at '|' or ']]'.
                p=iskats(t,p,u'[[')
                if p==-1:
                    break
                p1=p+2
#                p2=iskkonpodp(t,p1,u'[[',u']]')
                tkon=[u'[[',u'|',u']]']
                (p3,i)=iskats_mn(t,p1,tkon)
                if p3==-1:
                    break
                p4=p3+len(tkon[i])
                fl1=0
                ss=t[p1:p3]
                ss1=perv_upper(ubr_nk_prob(ss.replace(u'_',u' ')))
                if not tblvse.has_key(ss1):
                    sr=provzam(tblsl,ss,1,0)
                    if sr!=u'':
                        fperv=zapiszam(f1,entry.title,fperv,t[p:p4],
                                       t[p:p1]+sr+t[p3:p4])
                        sch_zam+=1
                        fl1=1
                if i==1 and (fl1 or tblim.has_key(ss1)):
                    # The link is piped: also check the label before ']]'.
                    (p5,i5)=iskats_mn(t,p4,tkon)
                    if p5!=-1 and i5==2:
                        p6=p5+len(tkon[i5])
                        ss=t[p4:p5]
                        sr=provzam(tblsl,ss,0,1)
                        if sr!=u'':
                            fperv=zapiszam(f1,entry.title,fperv,t[p3:p6],
                                           t[p3:p4]+sr+t[p5:p6])
                            vivod(u'z2 %d %s %s\n'%(fl1,ss,sr))
                            sch_zam+=1
                        p=p6
                    elif p5!=-1:
                        p=p5
                    else:
                        p=p3
                else:
                    p=p3
            if fperv==0:
                sch_str+=1
                f1.write(u'========================\n')
                f1.flush()

    vivod(u'sch_str=%d sch_zam=%d\n'%(sch_str,sch_zam))
vhjaz = u'eo'   # input wiki language code
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')    # path to the XML dump
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')   # path to the output file
#fprefim = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#fprefvse = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')
fprefim = u'mm_tim.txt'
fprefvse = u'mm_tvse.txt'
fprefimali = u'mm_timali.txt'
fotch = u'mm_otch.txt'   # report file
#mysite = wikipedia.getSite()
otch = codecs.open(fotch, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
    main(vhjaz,fvh,fvih,fprefim,fprefvse,fprefimali)
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()