# (stray header pasted from a Wikipedia page: "Material from Wikipedia, the
# free encyclopedia" -- commented out; uncommented it is a syntax error)
# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
def ubrkoment (text):
    """Return *text* with <nowiki>...</nowiki> sections and HTML comments removed.

    The removal loop re-searches after every deletion, so matches that only
    appear once surrounding markup has been stripped are removed as well.

    BUG FIX: the original pattern was r'<nowiki>.*?|<!--.*?-->'.  Its first
    alternative ends in a lazy '.*?' with nothing following, which always
    matches the empty string — so only the literal '<nowiki>' tag was deleted
    and the nowiki contents (and closing tag) were left in place.  The intent,
    per the comment and the canonical pywikibot pattern, is to drop the whole
    <nowiki>...</nowiki> span.
    """
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
def korr_im (mysite,jaz,t,pref,ftkat):
    """Normalize a page title and classify its namespace.

    Parameters:
        mysite -- site/family object used to look up localized namespace names
        jaz    -- language code passed to the family namespace lookups
        t      -- raw page title
        pref   -- list of ignorable title prefixes; the first match is stripped
        ftkat  -- truthy: accept only category pages; falsy: accept categories,
                  images, templates and plain (colon-free) titles

    Returns a tuple (title, ns) where ns is the MediaWiki namespace number
    (14 category, 6 image, 10 template, 0 article), or (u'', 0) when the
    title is rejected.
    """
    # Normalize: trim/cleanup via project helpers, uppercase first letter.
    t = perv_upper(ubr_nk_prob(t))

    # Strip at most one ignorable prefix, then normalize again.
    for p in pref:
        if t.startswith(p):
            t = perv_upper(ubr_nk_prob(t[len(p):]))
            break

    if ftkat:
        # Category-only mode: the title must carry either the canonical
        # 'Category:' prefix or the localized one; anything else is rejected.
        # NOTE: a canonical 'Category:' prefix is kept as-is here (it is not
        # rewritten to the localized form), matching the original behavior.
        if not t.startswith(u'Category:'):
            katprefi = mysite.family.category_namespace(jaz)
            if not t.startswith(katprefi + u':'):
                return (u'', 0)
        return (t, 14)

    # No colon at all -> plain article title, namespace 0.
    if u':' not in t:
        return (t, 0)

    # Colon present: accept only category/image/template titles, checking the
    # localized prefix first and the canonical English prefix second; canonical
    # prefixes are rewritten to the localized form.
    namespaces = (
        (mysite.family.category_namespace(jaz), u'Category:', 14),
        (mysite.family.image_namespace(jaz),    u'Image:',     6),
        (mysite.family.template_namespace(jaz), u'Template:', 10),
    )
    for localized, canonical, nsnum in namespaces:
        if t.startswith(localized + u':'):
            return (t, nsnum)
        if t.startswith(canonical):
            return (localized + u':' + t[len(canonical):], nsnum)

    # Colon-bearing title in some other namespace: rejected.
    return (u'', 0)
def main(fvh,fvih,fipref,ftkat):
tzfl=0
tzst={}
tzno={}
pref=[]
if fipref!=u'':
fpref=codecs.open(fipref,'rb',encoding='utf-8')
for s in fpref.readlines():
if ord(s[0]) == 65279:
s=s[1:]
s=s.replace(u'\r',u'')
if s[len(s)-1]==u'\n':
s=s[:len(s)-1]
pref.append(s)
fpref.close()
n=u''
f0=codecs.open(fvh,'rb')
zag=f0.read(1000)
f0.close()
jzag='xml:lang="'
p=iskats(zag,0,jzag)
if p==-1:
print '!!! net \'xml:lang=""\''
return
p+=len(jzag)
p1=iskats(zag,p,'"')
if p1==-1:
print '!!! net \'xml:lang=""\''
return
vhjaz=zag[p:p1]
f1=codecs.open(fvih, 'w', 'utf-8')
insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')
# open xml dump and read page titles out of it
dump = xmlreader.XmlDump(fvh)
redirR = insite.redirectRegex()
readPagesCount = 0
for entry in dump.parse():
readPagesCount += 1
# always print status message after 10000 pages
if readPagesCount % 10000 == 0:
print '%i pages read...' % readPagesCount
# m = redirR.search(entry.text)
m = redirR.match(entry.text)
if m:
pass
else:
(b0,ns0)=korr_im(insite,vhjaz,entry.title,pref,ftkat)
if b0==u'':
continue
te=ubrkoment(entry.text)
fperv=1
interwikiR = re.compile(r'\[\[([a-z\-]+)\s?:([^\[\]\n]*)\]\]')
for lang, pagetitle in interwikiR.findall(te):
# Check if it really is in fact an interwiki link to a known
# language, or if it's e.g. a category tag or an internal link
if lang in insite.family.langs:
if '|' in pagetitle:
# ignore text after the pipe
pagetitle = pagetitle[:pagetitle.index('|')]
(b1,ns1)=korr_im (lang, pagetitle,[],ftkat)
if b1==u'' or ns0!=ns1:
continue
if fperv:
b=u'%s:%s\n' % (vhjaz,b0)
f1.write(b)
fperv=0
b=u'%s:%s\n' % (lang, b1)
f1.write(b)
if fperv==0:
f1.write(u'\n')
f1.flush()
# Script entry point.
# Usage: <input XML dump> <output file> [<prefix-list file> | k]
# The literal third argument 'k' selects category-only mode (ftkat=1)
# instead of naming a prefix file.
fvh = sys.argv[1]
fvih = sys.argv[2]
fpref = u''
ftkat = 0
if len(sys.argv) >= 4:
    fpref = sys.argv[3]
    if fpref == u'k':
        fpref = u''
        ftkat = 1
# stopme() must run whether main() succeeds or raises; the original
# except/raise + else pair is exactly a try/finally.
try:
    main(fvh, fvih, fpref, ftkat)
finally:
    wikipedia.stopme()
# (stray trailing </nowiki> tag -- wiki copy/paste artifact, commented out)