# Material from Wikipedia, the free encyclopedia (stray extraction header; commented out so the file parses)
# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *
def vivod(b):
    """Emit message *b* both to the wiki console and to the report file.

    Writes through the module-global report handle ``otch`` and flushes
    immediately so progress is visible while the dump scan runs.
    """
    wikipedia.output(b)
    otch.write(b)
    otch.flush()
def ubrkoment (text):
    """Return *text* with <nowiki>...</nowiki> elements and HTML comments removed.

    Bug fix: the original pattern was ``<nowiki>.*?|<!--.*?-->``.  A trailing
    non-greedy ``.*?`` with nothing after it always matches zero characters,
    so the old code stripped only the literal ``<nowiki>`` opening tag and
    left the protected content and ``</nowiki>`` in place — contrary to the
    stated intent of ignoring text inside nowiki tags.  The alternation now
    consumes the whole element.  DOTALL lets comments/elements span lines.
    """
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    # A single global substitution is equivalent to the original
    # remove-and-rescan loop for these non-overlapping spans.
    return nowikiOrHtmlCommentR.sub(u'', text)
def xml_jaz(fvh):
    """Read the head of XML dump *fvh* and return its xml:lang attribute value.

    Only the first 1000 bytes are inspected.  Returns u'' (after printing a
    diagnostic) when the attribute cannot be located.
    """
    with codecs.open(fvh, 'rb') as dump_head:
        head = dump_head.read(1000)
    marker = 'xml:lang="'
    start = iskats(head, 0, marker)
    if start == -1:
        print('!!! net \'xml:lang=""\'')
        return u''
    start += len(marker)
    end = iskats(head, start, '"')
    if end == -1:
        print('!!! net \'xml:lang=""\'')
        return u''
    return head[start:end]
def korr_im (jaz,t,pref):
    """Normalise page title *t*: strip the first matching prefix from *pref*,
    capitalise the first letter, and reject titles in namespaces other than
    Category/Image (either the English or the *jaz*-local spelling).

    Returns the corrected title, or u'' for a disallowed namespace.
    NOTE(review): reads the module-global ``mysite``, which is not defined in
    this file — confirm it is provided elsewhere before calling.
    """
    for known_prefix in pref:
        if t.startswith(known_prefix):
            t = t[len(known_prefix):]
            break
    t = perv_upper(t)
    if u':' not in t:
        return t
    # Accept the English namespace spellings outright.
    if t.startswith(u'Category:') or t.startswith(u'Image:'):
        return t
    # Otherwise accept only the wiki's localized category/image namespaces.
    if t.startswith(mysite.family.category_namespace(jaz) + u':'):
        return t
    if t.startswith(mysite.family.image_namespace(jaz) + u':'):
        return t
    return u''
# Esperanto category-link openings searched for in page text:
# "Naskiĝ..." = "born ...", "Mort..." = "died ...", each in both
# first-letter capitalizations of the name part.
# NOTE(review): this constant is not referenced anywhere in this file.
iskkat=[
u'[[Kategorio:Naskiĝ',
u'[[Kategorio:naskiĝ',
u'[[Kategorio:Mort',
u'[[Kategorio:mort',
]
def main(vhjaz,fvh,fvih,fipref):
    """Scan an XML dump for pages whose text contains any configured
    category-link prefix, and write the matching titles to a report file.

    vhjaz  -- language code of the input wiki (taken from the dump header)
    fvh    -- path of the XML dump file
    fvih   -- path of the output report: for each prefix a '# <prefix>'
              header line followed by one matching page title per line
    fipref -- path of a UTF-8 text file listing one category prefix
              ('Namespace:Name') per line
    """
    # NOTE(review): tzfl/tzst/tzno are initialised but never used below.
    tzfl=0
    tzst={}
    tzno={}
    pref=[]
    if 1:
        # Read the prefix list; 65279 is U+FEFF, a leading BOM to strip.
        fpref=codecs.open(fipref,'rb',encoding='utf-8')
        for s in fpref.readlines():
            if ord(s[0]) == 65279:
                s=s[1:]
            # ubr_nk_prob: helper from zbib_tekst — presumably trims
            # trailing newline/whitespace; TODO confirm.
            s=ubr_nk_prob(s)
            if len(s)>0:
                pref.append(perv_upper(s))
        fpref.close()
    # For each prefix 'Ns:Name' build four search variants:
    # [Ns:Name, Category:Name, Ns:name, Category:name].
    # spk holds the variant lists; spr is the parallel list that collects
    # the titles of pages matching each prefix.
    spk=[]
    spr=[]
    for k in pref:
        k=perv_upper(k)
        i=iskat(k,u':')
        if i==-1:
            # A prefix without ':' cannot name a namespace — report and skip.
            vivod(u'!!! %s\n'%k)
            continue
        k0=k[:i+1]+perv_upper(k[i+1:])
        k1=u'Category'+k0[i:]
        k2=k[:i+1]+perv_lower(k[i+1:])
        k3=u'Category'+k2[i:]
        spk.append([k0,k1,k2,k3])
        spr.append([])
    f1=codecs.open(fvih, 'w', 'utf-8')
    insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')
    katprefi=insite.family.category_namespace(vhjaz)
    # First character of the localized category namespace, used below to
    # upper-case lower-cased '[[<ns>' link openings.
    kp0=katprefi[0]
    # Open the xml dump and iterate over its pages.
    dump = xmlreader.XmlDump(fvh)
    redirR = insite.redirectRegex()
    readPagesCount = 0
    for entry in dump.parse():
        readPagesCount += 1
        # Print a status message every 1000 pages.
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        m = redirR.search(entry.text)
        if (not m):
            # Not a redirect: normalise the wikitext so the plain substring
            # tests against the spk variants below can match.
            t=entry.text
            t=ubrkoment(t)
            t=t.replace(u'[[ ',u'[[').replace(u': ',u':')
            t=t.replace(u'_',u' ')
            t=t.replace(u'[[c',u'[[C')
            t=t.replace(u'[['+kp0.lower(),u'[['+kp0.upper())
            i=0
            while i<len(spk):
                dop=0
                for isk in spk[i]:
                    if isk in t:
                        dop=1
                        break
                if dop:
                    spr[i].append(entry.title)
                i+=1
    # Write the report: one '# variant' header per prefix, then the titles.
    i=0
    while i<len(spk):
        b=u'# %s\n' % spk[i][0]
        f1.write(b)
        f1.flush()
        for isk in spr[i]:
            b=u'%s\n' % isk
            f1.write(b)
            f1.flush()
        i+=1
# Script entry point: argv[1] is the XML dump, argv[2] the prefix list.
# Output file names are derived from the prefix file's base name.
fvh = sys.argv[1]
fpref = sys.argv[2]
base = fpref
if base.endswith(u'.txt'):
    base = base[:-4]
fvih = base + u'_re.txt'    # report of matching titles
fotch = base + u'_ot.txt'   # diagnostics written via vivod()
vhjaz = xml_jaz(fvh)
if vhjaz != u'':
    # 'otch' stays global: vivod() writes through it.
    otch = codecs.open(fotch, 'w', 'utf-8')
    main(vhjaz, fvh, fvih, fpref)
# (stray </nowiki> tag from wiki-page extraction; commented out so the file parses)