# Material from Wikipedia, the free encyclopedia (stray pasted header — commented out so the file parses)
# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
def ubrkoment (text):
    """Return *text* with <nowiki>...</nowiki> sections and HTML comments removed.

    Matching is case-insensitive and spans newlines (DOTALL), so multi-line
    comments and nowiki sections are stripped in full.
    """
    # BUG FIX: the original pattern was r'<nowiki>.*?|<!--.*?-->'.  The
    # trailing lazy '.*?' matches the empty string, so only the literal
    # '<nowiki>' tag was deleted and the enclosed text (plus '</nowiki>')
    # leaked through.  The closing tag belongs inside the alternative.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    # re.sub removes every match in one C-level pass instead of the
    # original quadratic search-and-splice loop.
    return nowikiOrHtmlCommentR.sub(u'', text)
def main(fvh,fvih):
f1=codecs.open(fvih, 'w', 'utf-8')
gmin=1401
gmax=2020
# open xml dump and read page titles out of it
dump = xmlreader.XmlDump(fvh)
readPagesCount = 0
tg={}
for entry in dump.parse():
readPagesCount += 1
# always print status message after 10000 pages
if readPagesCount % 10000 == 0:
print '%i pages read...' % readPagesCount
n = entry.title
otch.write(n+u'\n')
otch.flush()
if ( len(n)>4 and not entry.text.startswith(u'#REDIRECT') and
not entry.text.startswith(u'#redirect') and
not u':' in n and
not u'Eŭrovido-Kantokonkurso' in n ):
i=0
while i<len(n)-3:
if (n[i:i+4].isdigit() and (i==0 or not n[i-1].isdigit()) and
(i==len(n)-4 or not n[i+4].isdigit())):
g=int(n[i:i+4])
if g>=gmin and g<=gmax:
# b=((u'%s\n?=======\n1\n-=======\n+======='+
# u'\n\n[[Kategorio:%d]]\n'+
# u'============\n')%(n,g))
# f1.write(b)
# f1.flush()
if tg.has_key(g):
tg[g].append(n)
else:
tg[g]=[n]
i+=1
g=gmin
while g<=gmax:
if tg.has_key(g):
for n in tg[g]:
f1.write(n+u'\n')
f1.flush()
b=((u'?=======\n1\n-=======\n[[Kategorio:%d]]\n+=======\n'+
u'?=======\n1\n-=======\n+======='+
u'\n[[Kategorio:%d]]\n'+
u'============\n')%(g,g))
f1.write(b)
f1.flush()
g+=1
# Command-line arguments: input XML dump, output patch file, title log file.
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
fotch = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#mysite = wikipedia.getSite()
# Module-global log stream; main() writes every page title it reads here.
otch = codecs.open(fotch, 'w', 'utf-8')
try:
    main(fvh, fvih)
finally:
    # stopme() must run whether main() succeeded or raised; any exception
    # propagates unchanged after the cleanup call.
    wikipedia.stopme()
# stray </nowiki> tag from the page this script was copied out of — commented out so the file parses