# Материал из Википедии — свободной энциклопедии
# -*- coding: UTF-8 -*-
__version__='$Id:'
import wikipedia
import re, sys, math
import codecs
import xmlreader
import zperevod as perevod
from zbib_tekst import *
def vivod(b):
    # Echo a message both to the console and to the report file.
    # `otch` is a module-level report file handle opened by the script tail
    # before main() runs.
    wikipedia.output(b)
    otch.write(b)
    otch.flush()  # flush immediately so the report survives a crash
def ubrkoment (text):
    # Remove <nowiki>...</nowiki> sections and HTML comments from wiki text.
    #
    # BUG FIX: the original pattern's first alternative was '<nowiki>.*?'
    # (no closing tag); a trailing lazy '.*?' matches the empty string, so
    # only the literal opening tag was deleted and the nowiki body stayed.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    # Delete matches one at a time and re-search, so removals that expose
    # new matches (e.g. a comment split by an inner comment) are also
    # cleaned up — a plain re.sub would miss those.
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text
def zapiszam(f1, ti, fperv, ts, tn):
    # Append one replacement record to the output stream f1.
    #
    # f1    -- writable text stream
    # ti    -- category title, written in a header when fperv is truthy
    # fperv -- truthy on the first record for this category
    # ts    -- old text (a trailing newline is appended when non-empty)
    # tn    -- new text
    # Always returns 0.
    if fperv:
        f1.write(u'|+ kategorio\n%s\n' % ti)
        f1.flush()
    old = ts if ts == u'' else ts + u'\n'
    record = u'?=======\n1ksu\n-=======\n%s+=======\n%s\n' % (old, tn)
    f1.write(record)
    f1.flush()
    return 0
#def vich_cslov(t,tsl,min,max):
def cslov(t):
    # Split wiki text t into a list of normalized word stems.
    #
    # Strips language/category links and comments, replaces punctuation
    # with spaces, lowercases, and trims endings that look like Esperanto
    # inflection (-n accusative, -j plural, adjectival -a -> -o).
    # Tokens shorter than 3 chars or containing digits are dropped.
    #
    # BUG FIX: the original ended with `return a`, returning only the last
    # token as a string; every caller treats the result as a list of words.
    t = wikipedia.removeLanguageLinks(t)
    t = wikipedia.removeCategoryLinks(t, mysite)
    t = ubrkoment (t)
    # punctuation and whitespace become token separators
    for c in u'`~!@#$%^&*()_+|-=\\{};\':",./<>?\n\t\r':
        t = t.replace(c, u' ')
    # link brackets are deleted outright so [[word]] keeps its word intact
    for c in u'[]':
        t = t.replace(c, u'')
    r = []
    for a in t.split(u' '):
        if len(a) < 3:
            continue
        # skip tokens containing any digit
        if any(c in a for c in u'0123456789'):
            continue
        a = a.lower()
        # strip inflectional endings (presumably Esperanto -n/-j/-a; the
        # -a -> -o step maps adjectives onto the noun stem)
        if a.endswith(u'n'):
            a = a[:-1]
        if a.endswith(u'j'):
            a = a[:-1]
        if a.endswith(u'a'):
            a = a[:-1] + u'o'
        if len(a) < 3:
            continue
        r.append(a)
    return r
def tkateg(t):
    # Extract category titles from wiki text t.
    #
    # Returns (titles, total): `titles` lists category titles that contain
    # no digits; `total` is the full count of category links found, so the
    # caller can tell "no categories at all" from "only digit categories".
    sk = wikipedia.getCategoryLinks(t, mysite)
    r = [k.title() for k in sk
         if not any(c in k.title() for c in u'0123456789')]
    return (r, len(sk))
def provsta(n, t):
    # Decide whether a page is a plain article worth processing.
    #
    # n -- page title, t -- page text.  Returns 0 for redirects, titles
    # containing a colon (taken as a non-article namespace), and pages
    # carrying the {{apartigilo}} disambiguation template; 1 otherwise.
    if mysite.redirectRegex().search(t):
        return 0
    if u':' in n:
        return 0
    if (u'{{apartigilo}}' in t) or (u'{{Apartigilo}}' in t):
        return 0
    return 1
def spisvstr(s, ra):
    # Join the strings in sequence s with separator ra.
    #
    # Idiom fix: the original built the result by repeated `+=`
    # concatenation with a first-element flag — that is exactly
    # str.join, which is also linear instead of quadratic.
    return ra.join(s)
def main(jaz, fvh, fvih, fipref):
    # Suggest categories for uncategorized wiki articles.
    #
    # jaz    -- language code (unused here beyond the signature)
    # fvh    -- path to the XML dump to scan
    # fvih   -- path of the output file (opened, currently unused)
    # fipref -- path of a UTF-8 file listing known category names
    #
    # Pass 1 over the dump records the categories of categorized articles;
    # pass 2 proposes categories for articles that have none, by matching
    # normalized word n-grams of the title against (a) the category-name
    # table and (b) titles of already-categorized articles.  All proposals
    # are reported through vivod().
    tblkat = {}    # known category title -> 1
    tblkatc = {}   # normalized stem sequence -> original category title
    fpref = codecs.open(fipref, 'rb', encoding='utf-8')
    for s in fpref.readlines():
        # strip a UTF-8 BOM decoded as U+FEFF on the first line
        if s and ord(s[0]) == 65279:
            s = s[1:]
        s = ubr_nk_prob(s)
        tblkat[s] = 1
        s1 = spisvstr(cslov(s), u' ')
        tblkatc[s1] = s
    fpref.close()
    f1 = codecs.open(fvih, 'w', 'utf-8')
    dump = xmlreader.XmlDump(fvh)
    maxpages = 300000
    tkatuz = {}    # lowercased article title -> its digit-free categories
    nobst = 0      # articles examined
    nnekst = 0     # articles with no categories at all
    readPagesCount = 0
    # --- pass 1: collect categories of existing articles ---
    for entry in dump.parse():
        readPagesCount += 1
        if readPagesCount % 1000 == 0:
            print('%i pages read...' % readPagesCount)
        if readPagesCount > maxpages:
            break
        n = entry.title
        t = entry.text
        if not provsta(n, t):
            continue
        nobst += 1
        (tkat, nkat) = tkateg(t)
        if len(tkat) > 0:
            tkatuz[n.lower()] = tkat
        if nkat == 0:
            nnekst += 1
    vivod(u'\n\n\nnobst=%d nnekst=%d\n\n\n\n' % (nobst, nnekst))
    # --- pass 2: propose categories for uncategorized articles ---
    # NOTE(review): reuses the same XmlDump object; assumes parse()
    # restarts from the beginning of the file — confirm with xmlreader.
    readPagesCount = 0
    for entry in dump.parse():
        readPagesCount += 1
        if readPagesCount % 1000 == 0:
            print('%i pages read...' % readPagesCount)
        if readPagesCount > maxpages:
            break
        n = entry.title
        t = entry.text
        if not provsta(n, t):
            continue
        (tkat, nkat) = tkateg(t)
        if nkat != 0:
            continue
        tts = cslov(n)
        # match title stem n-grams (longest first) against category names
        m = 3
        while m > 0:
            i = len(tts) - m
            while i >= 0:
                # BUG FIX: original called cslov() on the list slice, which
                # would crash — the stems are already normalized, join them.
                ts = spisvstr(tts[i:i + m], u' ')
                vivod(u' prob %s\n' % ts)
                if ts in tblkatc:
                    vivod(u'+ %s %s %s\n' % (n, ts, tblkatc[ts]))
                i -= 1
            # BUG FIX: original did `m += 1` here, an infinite loop
            m -= 1
        # match stem n-grams against titles of categorized articles
        for i in range(len(tts)):
            if i < len(tts) - 2:
                ts = tts[i] + u' ' + tts[i + 1] + u' ' + tts[i + 2]
                if ts in tkatuz:
                    # BUG FIX: original passed 4 values to a 3-slot format
                    # and called spisvstr with a single argument
                    vivod(u'+ %s %s %s\n' % (n, ts, spisvstr(tkatuz[ts], u', ')))
                    break
            if i < len(tts) - 1:
                ts = tts[i] + u' ' + tts[i + 1]
                if ts in tkatuz:
                    vivod(u'+ %s %s %s\n' % (n, ts, spisvstr(tkatuz[ts], u', ')))
                    break
            ts = tts[i]
            if ts in tkatuz:
                vivod(u'+ %s %s %s\n' % (n, ts, spisvstr(tkatuz[ts], u', ')))
                break
    # close the (currently unused) output file instead of leaking it
    f1.close()
# --- script entry: fixed language, args, report files, run main() ---
jaz = u'eo'
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')    # XML dump path
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')   # output base name
if fvih.endswith(u'.txt'):
    fvih = fvih[:-4]
fotch = fvih + u'_ot.txt'    # main report file
fotch2 = fvih + u'_op.txt'   # secondary report file
fvih += u'.txt'
fpref = u'eo_ka_sp.txt'      # fixed list of known category names
otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite(jaz)
try:
    main(jaz, fvh, fvih, fpref)
finally:
    # always release the framework (original did stopme in both the
    # exception path and the success path)
    wikipedia.stopme()
# (stray "</nowiki>" scrape artifact commented out)