# Material from Wikipedia, the free encyclopedia (stray pasted header — commented out so the file parses)
# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
def ubrkoment (text):
    """Return *text* with <nowiki>...</nowiki> sections and HTML comments removed.

    Matching is case-insensitive and spans newlines (DOTALL), so multi-line
    comments and nowiki sections are stripped in full.
    """
    # BUG FIX: the original pattern was r'<nowiki>.*?|<!--.*?-->'.  The
    # trailing lazy '.*?' matches the empty string, so only the literal
    # '<nowiki>' tag was deleted and the enclosed text (plus '</nowiki>')
    # leaked through.  The closing tag belongs inside the alternative.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    # re.sub removes every match in one C-level pass instead of the
    # original quadratic search-and-splice loop.
    return nowikiOrHtmlCommentR.sub(u'', text)
def main(fvh,fvih):
f1=codecs.open(fvih, 'w', 'utf-8')
gmin=1401
gmax=2020
# open xml dump and read page titles out of it
dump = xmlreader.XmlDump(fvh)
readPagesCount = 0
tg={}
for entry in dump.parse():
readPagesCount += 1
# always print status message after 10000 pages
if readPagesCount % 10000 == 0:
print '%i pages read...' % readPagesCount
n = entry.title
otch.write(n+u'\n')
otch.flush()
if ( len(n)>4 and not entry.text.startswith(u'#REDIRECT') and
not entry.text.startswith(u'#redirect') and
not u':' in n and
not u'Eŭrovido-Kantokonkurso' in n ):
i=0
while i<len(n)-3:
if (n[i:i+4].isdigit() and (i==0 or not n[i-1].isdigit()) and
(i==len(n)-4 or not n[i+4].isdigit())):
g=int(n[i:i+4])
if g>=gmin and g<=gmax:
# b=((u'%s\n?=======\n1\n-=======\n+======='+
# u'\n\n[[Kategorio:%d]]\n'+
# u'============\n')%(n,g))
# f1.write(b)
# f1.flush()
if tg.has_key(g):
tg[g].append(n)
else:
tg[g]=[n]
i+=1
g=gmin
while g<=gmax:
if tg.has_key(g):
for n in tg[g]:
f1.write(n+u'\n')
f1.flush()
b=((u'?=======\n1\n-=======\n[[Kategorio:%d]]\n+=======\n'+
u'?=======\n1\n-=======\n+======='+
u'\n[[Kategorio:%d]]\n'+
u'============\n')%(g,g))
f1.write(b)
f1.flush()
g+=1
# Command-line arguments: input XML dump, output patch file, title log file.
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
fotch = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#mysite = wikipedia.getSite()
# Module-global log stream; main() writes every page title it reads here.
otch = codecs.open(fotch, 'w', 'utf-8')
try:
    main(fvh, fvih)
finally:
    # stopme() must run whether main() succeeded or raised; any exception
    # propagates unchanged after the cleanup call.
    wikipedia.stopme()
# stray </nowiki> tag from the page this script was copied out of — commented out so the file parses