# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
def ubrkoment(text):
    # Remove text within <nowiki> tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
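# A minimal usage sketch (the literal below is hypothetical): both kinds of
# commented-out spans are deleted, so
#   ubrkoment(u'a<!-- x --><nowiki>y</nowiki>b')
# returns u'ab'.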
def korr_im(jaz, t, pref):
    # Strip the first matching prefix from the title, capitalize it, and
    # drop titles from any namespace other than Category: or Image:
    for p in pref:
        if t.startswith(p):
            t = t[len(p):]
            break
    t = perv_upper(t)
    if u':' in t:
        if ((not t.startswith(u'Category:')) and
            (not t.startswith(u'Image:'))):
            katprefi = mysite.family.category_namespace(jaz)
            if not t.startswith(katprefi + u':'):
                izprefi = mysite.family.image_namespace(jaz)
                if not t.startswith(izprefi + u':'):
                    return u''
    return t
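# Usage sketch (assuming perv_upper from zbib_tekst upper-cases the first
# letter, as its name suggests, and that pref contains u'la '):
#   korr_im(u'eo', u'la titolo', pref)  ->  u'Titolo'
# while a title in another namespace, e.g. u'Vikipedio:Diskuto', gives u''
# (the local Category: and Image: prefixes come from the family file).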
def main(vhjaz, fvh, fvih, fipref):
    tzfl = 0
    tzst = {}
    tzno = {}
    pref = []
    if fipref != u'':
        # read the list of title prefixes, one per line
        fpref = codecs.open(fipref, 'rb', encoding='utf-8')
        for s in fpref.readlines():
            # skip a leading BOM (U+FEFF)
            if ord(s[0]) == 65279:
                s = s[1:]
            # strip the line ending
            s = s.replace(u'\r', u'')
            if s[len(s)-1] == u'\n':
                s = s[:len(s)-1]
            pref.append(s)
        fpref.close()
    n = u''
    # f0=codecs.open(fvh,'rb',encoding='utf-8')
    f1 = codecs.open(fvih, 'w', 'utf-8')
    insite = wikipedia.getSite(vhjaz, fam=u'wikipedia')
    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    redirR = insite.redirectRegex()
    readPagesCount = 0
    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 10000 pages
        if readPagesCount % 10000 == 0:
            print '%i pages read...' % readPagesCount
        m = redirR.search(entry.text)
        if m:
            target = m.group(1)
            # There might be redirects to another wiki. Ignore these.
            for code in wikipedia.getSite().family.langs.keys():
                if target.startswith('%s:' % code) or target.startswith(':%s:' % code):
                    wikipedia.output(u'NOTE: Ignoring %s which is a redirect to %s:' % (entry.title, code))
                    target = None
                    break
            # if the redirect does not link to another wiki
            if target:
                target = target.replace('_', ' ')
                # remove leading and trailing whitespace
                target = target.strip()
                # capitalize the first letter
                if not wikipedia.getSite().nocapitalize:
                    target = target[0].upper() + target[1:]
                if '#' in target:
                    target = target[:target.index('#')]
                if '|' in target:
                    wikipedia.output(u'HINT: %s is a redirect with a pipelink.' % entry.title)
                    target = target[:target.index('|')]
                # tblredir[entry.title] = target
                # b=u'%s|%s\n' % (entry.title, target)
                tt = entry.title
                tc = target
                # compare the redirect title and its target word by word;
                # only pairs with at least two words that differ solely in
                # letter case are written out
                tts = tt.split(u' ')
                tcs = tc.split(u' ')
                ltt = len(tts)
                ltc = len(tcs)
                if ltt == ltc and ltt >= 2 and tt.lower() == tc.lower():
                    # dop stays 1 unless the title has a lower-case letter
                    # where the target has an upper-case one
                    dop = 1
                    for i in range(ltt):
                        if tts[i] != tcs[i]:
                            if len(tts[i]) != len(tcs[i]):
                                dop = 0
                            else:
                                for j in range(len(tts[i])):
                                    if (tts[i][j] == tts[i][j].lower() and
                                        tcs[i][j] != tcs[i][j].lower()):
                                        dop = 0
                            # for j in range(len(tcs[i])):
                            #     if j>0 and tcs[i][j]!=tcs[i][j].lower():
                            #         dop=0
                    b = u'%d|%s|%s\n' % (dop, tt, tc)
                    f1.write(b)
                    f1.flush()
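# Each line written above has the form '<dop>|<redirect title>|<target>',
# e.g. (hypothetical pair) u'1|Monda Organizaĵo|Monda organizaĵo'.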
vhjaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
fpref = u''
if len(sys.argv) >= 4:
    fpref = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')
#mysite = wikipedia.getSite()
#otch = codecs.open(fotch, 'w', 'utf-8')
mysite = wikipedia.getSite()
try:
    main(vhjaz, fvh, fvih, fpref)
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()
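# Typical invocation (file names are hypothetical):
#   python slov_iz_xml.py eowiki-dump.xml redirects.txt prefixes.txt
# where sys.argv[1] is the XML dump to read, sys.argv[2] is the output file
# for the dop|title|target lines, and the optional sys.argv[3] is a UTF-8
# file with one prefix per line.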