# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
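# Scans the pages of a Wikipedia XML dump for occurrences of search strings
# listed in a replacement-table file and writes each matching page title,
# together with the proposed replacement blocks, to an output file (pages are
# separated by a line of '=' characters). Redirect pages are skipped.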
def ubrkoment(text):
    # Ignore text within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
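# Example of what ubrkoment strips (both markup kinds are removed together
# with their content):
#   ubrkoment(u'a<!-- hidden -->b<nowiki>[[x]]</nowiki>c')  ->  u'abc'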
def zapiszam(f1, ti, fperv, ts, tn, kom, rezx):
    # Write one found match to the output file. On the first match for a page
    # (fperv is true) an optional |comment line and the page title are written
    # first, then the search/replacement block itself.
    if fperv:
        b = u''
        if kom != u'':
            b = u'|%s\n' % kom
        b += u'%s\n' % ti
        f1.write(b)
        f1.flush()
    if ts != u'':
        ts += u'\n'
    b = (u'?=======\n1%s\n-=======\n%s+=======\n%s\n') % (rezx, ts, tn)
    f1.write(b)
    f1.flush()
    return 0
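# Example of the block zapiszam appends for the first match on a page, with
# kom=u'note', ts=u'old', tn=u'new' and rezx left empty as in main():
#   |note
#   PageTitle
#   ?=======
#   1
#   -=======
#   old
#   +=======
#   new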
def main(vhjaz, fvh, fvih, fipref):
    tzfl = 0
    tzst = {}
    tzno = {}
    tblz = []
    pref = []
    if 1:
        # Read the replacement table: each line holds "flags search replacement
        # [comment]" and is kept only if it has at least three fields and the
        # flags field starts with '1'.
        fpref = codecs.open(fipref, 'rb', encoding='utf-8')
        for s in fpref.readlines():
            if ord(s[0]) == 65279:  # strip a leading BOM (U+FEFF)
                s = s[1:]
            s = s.replace(u'\r', u'')
            if s.endswith(u'\n'):
                s = s[:-1]
            st = s.split(u' ')
            if len(st) >= 3 and st[0].startswith(u'1'):
                kom = u''
                if len(st) >= 4:
                    kom = st[3]
                tblz.append((st[0], st[1], st[2], kom))
        fpref.close()
    n = u''
    # f0 = codecs.open(fvh, 'rb', encoding='utf-8')
    f1 = codecs.open(fvih, 'w', 'utf-8')
    insite = wikipedia.getSite(vhjaz, fam=u'wikipedia')
    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    redirR = insite.redirectRegex()
    readPagesCount = 0
    sch_str = 0
    sch_zam = 0
    for entry in dump.parse():
        readPagesCount += 1
        # print a status message every 1000 pages
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        m = redirR.search(entry.text)
        if m:
            pass
        else:
            # if entry.title.startswith(u'Vikipedio:'):
            #     continue
            tblz1 = []
            tkom = []
            for r, tt, tc, kom in tblz:
                # if ((not entry.title.startswith(tt)) and
                #     (not entry.title.startswith(tc)) and (tt in entry.text)):
                # a 'p' in the flags field skips pages whose title contains ':'
                if (u'p' in r) and (u':' in entry.title):
                    continue
                if tt in entry.text:
                    tblz1.append((tt, tc))
                    if kom != u'' and (not kom in tkom):
                        tkom.append(kom)
            okom = spisvstr(tkom, u', ')
            fperv = 1
            for tt, tc in tblz1:
                fperv = zapiszam(f1, entry.title, fperv, tt, tc, okom, u'')
                sch_zam += 1
            if fperv == 0:
                sch_str += 1
                f1.write(u'========================\n')
                f1.flush()
    wikipedia.output(u'sch_str=%d sch_zam=%d' % (sch_str, sch_zam))
jaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fi = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
if fi.endswith(u'.txt'):
    fi = fi[:-4]
fotch = fi + u'_ot.txt'
fpref = fi + u'.txt'
fvih = fi + u'_gz.txt'
#fpref = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')
#otch = codecs.open(fotch, 'w', 'utf-8')
mysite = wikipedia.getSite(jaz)
try:
    main(jaz, fvh, fvih, fpref)
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()
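# Usage sketch, assuming the file is saved as slov_iz_xml.py (the dump and
# table file names below are examples only):
#   python slov_iz_xml.py eowiki-pages-articles.xml zam
# reads the replacement table from zam.txt and writes the matches to zam_gz.txt.
# A table line is expected to look like "flags search replacement [comment]",
# for example:
#   1 malnova nova anstataŭigo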