# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
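# Scans the pages of a Wikipedia XML dump for occurrences of search strings
# listed in a replacement-table file and writes each matching page title,
# together with the proposed replacement blocks, to an output file (pages are
# separated by a line of '=' characters). Redirect pages are skipped.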
def ubrkoment(text):
    # Ignore text within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
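# Example of what ubrkoment strips (both markup kinds are removed together
# with their content):
#   ubrkoment(u'a<!-- hidden -->b<nowiki>[[x]]</nowiki>c')  ->  u'abc'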
def zapiszam(f1, ti, fperv, ts, tn, kom, rezx):
    # Write one found match to the output file. On the first match for a page
    # (fperv is true) an optional |comment line and the page title are written
    # first, then the search/replacement block itself.
    if fperv:
        b = u''
        if kom != u'':
            b = u'|%s\n' % kom
        b += u'%s\n' % ti
        f1.write(b)
        f1.flush()
    if ts != u'':
        ts += u'\n'
    b = (u'?=======\n1%s\n-=======\n%s+=======\n%s\n') % (rezx, ts, tn)
    f1.write(b)
    f1.flush()
    return 0
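# Example of the block zapiszam appends for the first match on a page, with
# kom=u'note', ts=u'old', tn=u'new' and rezx left empty as in main():
#   |note
#   PageTitle
#   ?=======
#   1
#   -=======
#   old
#   +=======
#   new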
def main(vhjaz, fvh, fvih, fipref):
    tzfl = 0
    tzst = {}
    tzno = {}
    tblz = []
    pref = []
    if 1:
        # Read the replacement table: each line holds "flags search replacement
        # [comment]" and is kept only if it has at least three fields and the
        # flags field starts with '1'.
        fpref = codecs.open(fipref, 'rb', encoding='utf-8')
        for s in fpref.readlines():
            if ord(s[0]) == 65279:  # strip a leading BOM (U+FEFF)
                s = s[1:]
            s = s.replace(u'\r', u'')
            if s.endswith(u'\n'):
                s = s[:-1]
            st = s.split(u' ')
            if len(st) >= 3 and st[0].startswith(u'1'):
                kom = u''
                if len(st) >= 4:
                    kom = st[3]
                tblz.append((st[0], st[1], st[2], kom))
        fpref.close()
    n = u''
    # f0 = codecs.open(fvh, 'rb', encoding='utf-8')
    f1 = codecs.open(fvih, 'w', 'utf-8')
    insite = wikipedia.getSite(vhjaz, fam=u'wikipedia')
    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    redirR = insite.redirectRegex()
    readPagesCount = 0
    sch_str = 0
    sch_zam = 0
    for entry in dump.parse():
        readPagesCount += 1
        # print a status message every 1000 pages
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        m = redirR.search(entry.text)
        if m:
            pass
        else:
            # if entry.title.startswith(u'Vikipedio:'):
            #     continue
            tblz1 = []
            tkom = []
            for r, tt, tc, kom in tblz:
                # if ((not entry.title.startswith(tt)) and
                #     (not entry.title.startswith(tc)) and (tt in entry.text)):
                # a 'p' in the flags field skips pages whose title contains ':'
                if (u'p' in r) and (u':' in entry.title):
                    continue
                if tt in entry.text:
                    tblz1.append((tt, tc))
                    if kom != u'' and (not kom in tkom):
                        tkom.append(kom)
            okom = spisvstr(tkom, u', ')
            fperv = 1
            for tt, tc in tblz1:
                fperv = zapiszam(f1, entry.title, fperv, tt, tc, okom, u'')
                sch_zam += 1
            if fperv == 0:
                sch_str += 1
                f1.write(u'========================\n')
                f1.flush()
    wikipedia.output(u'sch_str=%d sch_zam=%d' % (sch_str, sch_zam))
jaz = u'eo'
#vhjaz = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fi = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
if fi.endswith(u'.txt'):
    fi = fi[:-4]
fotch = fi + u'_ot.txt'
fpref = fi + u'.txt'
fvih = fi + u'_gz.txt'
#fpref = wikipedia.argHandler(sys.argv[3], 'slov_iz_xml')
#fotch = wikipedia.argHandler(sys.argv[4], 'slov_iz_xml')
#otch = codecs.open(fotch, 'w', 'utf-8')
mysite = wikipedia.getSite(jaz)
try:
    main(jaz, fvh, fvih, fpref)
except:
    wikipedia.stopme()
    raise
else:
    wikipedia.stopme()
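# Usage sketch, assuming the file is saved as slov_iz_xml.py (the dump and
# table file names below are examples only):
#   python slov_iz_xml.py eowiki-pages-articles.xml zam
# reads the replacement table from zam.txt and writes the matches to zam_gz.txt.
# A table line is expected to look like "flags search replacement [comment]",
# for example:
#   1 malnova nova anstataŭigo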