Static Wikipedia February 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Web Analytics
Cookie Policy Terms and Conditions Участник:Maksim-e/zpdg vkatc1.py — Википедия

Участник:Maksim-e/zpdg vkatc1.py

Материал из Википедии — свободной энциклопедии

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys, math
import codecs
import xmlreader
import zperevod as perevod
from zbib_tekst import *

def vivod(b):
    """Emit the message *b* to both reporting channels.

    The text is handed to ``wikipedia.output`` and also written to the
    module-level report stream ``otch``, which is flushed immediately so
    the report file stays current while the script is running.
    """
    wikipedia.output(b)
    report = otch
    report.write(b)
    report.flush()

def ubrkoment (text):
    """Return *text* with ``<nowiki>`` sections and HTML comments removed.

    Both the delimiters and everything between them are dropped.  Matching
    is case-insensitive and spans line breaks.  A ``<nowiki>`` without a
    closing ``</nowiki>`` (or a ``<!--`` without ``-->``) is left untouched.
    """
    # Ignore text within nowiki tags and HTML comments.  The nowiki branch
    # has to include the closing tag: a bare non-greedy ``.*?`` at the end of
    # an alternative matches zero characters and would strip only the
    # opening tag, leaving the nowiki content in place.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        # Cut out the matched span and rescan from the start, because the
        # text on both sides of the cut may join into a new match.
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text


def zapiszam(f1,ti,fperv,ts,tn):
    """Write one replacement record to the stream *f1*.

    When *fperv* is true a ``|+ kategorio`` header carrying *ti* is written
    first.  The record itself consists of the ``?=======`` / ``1ksu`` lines,
    a ``-=======`` section holding *ts* (followed by a newline unless *ts*
    is the empty string) and a ``+=======`` section holding *tn*.  The
    stream is flushed after every write.  Always returns 0.
    """
    if fperv:
        f1.write(u'|+ kategorio\n%s\n'%ti)
        f1.flush()

    # An empty "old text" section contributes no line of its own.
    old_section = ts
    if old_section != u'':
        old_section = old_section + u'\n'

    f1.write(u'?=======\n1ksu\n-=======\n%s+=======\n%s\n' % (old_section, tn))
    f1.flush()
    return 0


#def vich_cslov(t,tsl,min,max):
def cslov(t):
    """Split the wiki text *t* into a list of normalised words.

    Language links, category links, nowiki sections and HTML comments are
    removed first (via ``wikipedia.removeLanguageLinks``,
    ``wikipedia.removeCategoryLinks`` and ``ubrkoment``).  Punctuation and
    line breaks then become spaces, square brackets are dropped, and the
    text is split on single spaces.

    A word is kept only if it has at least three characters (both before
    and after normalisation) and contains no ASCII digit.  Normalisation
    lower-cases the word, strips one trailing ``n``, then one trailing
    ``j``, and finally turns a trailing ``a`` into ``o`` -- presumably to
    fold Esperanto accusative, plural and adjective endings onto the noun
    form (the script runs with ``jaz = u'eo'``); confirm before reusing
    this for another language.

    Returns the list of kept words in their order of appearance.
    """
    t = wikipedia.removeLanguageLinks(t)
    t = wikipedia.removeCategoryLinks(t, mysite)
    t = ubrkoment (t)
    for c in u'`~!@#$%^&*()_+|-=\\{};\':",./<>?\n\t\r':
        t=t.replace(c,u' ')
    for c in u'[]':
        t=t.replace(c,u'')

    r=[]
    for a in t.split(u' '):
        # Consecutive separators yield empty pieces; they fail this test too.
        if len(a)<3:
            continue
        if any(c in a for c in u'0123456789'):
            continue
        a=a.lower()
        if a.endswith(u'n'):
            a=a[:-1]
        if a.endswith(u'j'):
            a=a[:-1]
        if a.endswith(u'a'):
            a=a[:-1]+u'o'
        # Stripping the endings may have shortened the word below the limit.
        if len(a)<3:
            continue
        r.append(a)
    # Return the accumulated list; callers take len() of it, slice it and
    # join it with spisvstr().
    return r

def tkateg(t):
    """Collect the category titles referenced by the wiki text *t*.

    Returns a pair ``(titles, total)``: *titles* holds the title of every
    category link whose title contains no ASCII digit, and *total* is the
    number of all category links found, digit-bearing ones included.
    """
    links = wikipedia.getCategoryLinks(t,mysite)
    digit_free = []
    for link in links:
        title = link.title()
        if not any(digit in title for digit in u'0123456789'):
            digit_free.append(link.title())
    return (digit_free, len(links))

def provsta(n,t):
    """Tell whether the page titled *n* with text *t* should be processed.

    Returns 0 for a page that matches the site's redirect regex, for a page
    whose title contains a colon, and for a page whose text contains the
    ``{{apartigilo}}`` / ``{{Apartigilo}}`` template; returns 1 otherwise.
    """
    if mysite.redirectRegex().search(t):
        return 0

    # Any colon in the title rejects the page (the earlier, narrower test
    # was for the u'Vikipedio:' prefix only).
    if u':' in n:
        return 0

    for marker in (u'{{apartigilo}}', u'{{Apartigilo}}'):
        if marker in t:
            return 0
    return 1

def spisvstr(s,ra):
    """Concatenate the strings of *s*, placing *ra* between neighbours.

    An empty *s* gives an empty string; a single element is returned
    without any separator.
    """
    pieces = []
    for position, item in enumerate(s):
        if position:
            pieces.append(ra)
        pieces.append(item)
    return u''.join(pieces)

def _iter_pages(dump, maxpages):
    """Yield ``(title, text)`` for every dump page accepted by provsta().

    Prints a progress line after each 1000 entries read and stops once more
    than *maxpages* entries have been read from ``dump.parse()``.
    """
    readPagesCount = 0
    for entry in dump.parse():
        readPagesCount += 1
        # always print status message after 1000 pages
        if readPagesCount % 1000 == 0:
            print('%i pages read...' % readPagesCount)
        if readPagesCount > maxpages:
            break
        if provsta(entry.title, entry.text):
            yield entry.title, entry.text


def _first_known_ngram(tts, tkatuz):
    """Return the first word n-gram of *tts* that is a key of *tkatuz*.

    Start positions are scanned left to right; at each position the
    three-word, two-word and one-word phrases are tried in that order.
    Returns None when nothing matches.
    """
    for i in range(len(tts)):
        for m in (3, 2, 1):
            if i + m > len(tts):
                continue
            ts = spisvstr(tts[i:i+m], u' ')
            if ts in tkatuz:
                return ts
    return None


def main(jaz,fvh,fvih,fipref,maxpages=300000):
    """Report category candidates for uncategorised pages of an XML dump.

    Parameters:
        jaz      -- language code; not used by the current implementation.
        fvh      -- path of the XML dump, read with ``xmlreader.XmlDump``.
        fvih     -- path of the output file.  It is created (truncated) and
                    closed, but nothing is written to it at present.
        fipref   -- path of a UTF-8 text file with one category name per line.
        maxpages -- upper bound on the number of dump entries read per pass.

    The dump is read twice.  The first pass maps the lower-cased title of
    every accepted page that has digit-free categories to those categories
    and counts accepted pages (``nobst``) and pages without any category
    (``nnekst``).  The second pass takes each accepted page without
    categories, splits its title into normalised words with cslov() and
    reports, through vivod():

    * every 3-, 2- and 1-word phrase of the title (``prob`` lines) and a
      ``+`` line whenever the phrase equals a normalised category name from
      *fipref*;
    * the first phrase that equals the lower-cased title of a categorised
      page, together with that page's categories.
    """
    # Normalised category name -> category name as listed in the file.
    # cslov() is expected to return a list of words here.
    tblkatc={}
    fpref=codecs.open(fipref,'rb',encoding='utf-8')
    try:
        for s in fpref.readlines():
            # Drop a leading byte order mark (U+FEFF).
            if s.startswith(u'\ufeff'):
                s=s[1:]
            # ubr_nk_prob comes from zbib_tekst; presumably it trims
            # surrounding whitespace -- TODO confirm.
            s=ubr_nk_prob(s)
            tblkatc[spisvstr(cslov(s),u' ')]=s
    finally:
        fpref.close()

    f1=codecs.open(fvih, 'w', 'utf-8')
    try:
        # First pass: remember the categories of already categorised pages.
        tkatuz={}
        nobst=0
        nnekst=0
        for n,t in _iter_pages(xmlreader.XmlDump(fvh),maxpages):
            nobst+=1
            (tkat,nkat)=tkateg(t)
            if len(tkat)>0:
                tkatuz[n.lower()]=tkat
            if nkat==0:
                nnekst+=1

        vivod(u'\n\n\nnobst=%d  nnekst=%d\n\n\n\n'%(nobst,nnekst))

        # Second pass: look for candidates for pages without any category.
        # A fresh XmlDump is created so the pass does not depend on whether
        # parse() may be called twice on the same object.
        for n,t in _iter_pages(xmlreader.XmlDump(fvh),maxpages):
            (tkat,nkat)=tkateg(t)
            if nkat!=0:
                continue

            tts=cslov(n)

            # Try the longest phrases first, each from right to left.
            for m in (3,2,1):
                for i in range(len(tts)-m,-1,-1):
                    ts=spisvstr(tts[i:i+m],u' ')
                    vivod(u' prob %s\n'%ts)
                    if ts in tblkatc:
                        vivod(u'+ %s   %s   %s\n'%(n,ts,tblkatc[ts]))

            ts=_first_known_ngram(tts,tkatuz)
            if ts is not None:
                vivod(u'+ %s   %s   %s\n'%(n,ts,spisvstr(tkatuz[ts],u', ')))
    finally:
        f1.close()


# Language code of the wiki whose dump is processed.
jaz = u'eo'

# Command line: <input XML dump> <output file name>.  Each argument is
# passed through wikipedia.argHandler under the module name 'slov_iz_xml'.
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')

# Derive the two report file names from the output name without its
# optional '.txt' extension, then make sure the output name carries it.
if fvih.endswith(u'.txt'):
    fvih=fvih[:-4]
fotch = fvih+u'_ot.txt'
fotch2 = fvih+u'_op.txt'
fvih+=u'.txt'

# File with the category names, one per line.
fpref = u'eo_ka_sp.txt'

# Module-level state read by the functions above: vivod() writes to otch,
# several helpers use mysite.
otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite(jaz)

try:
    try:
        main(jaz,fvh,fvih,fpref)
    finally:
        # Runs on success and on failure alike; an exception from main()
        # still propagates afterwards.
        wikipedia.stopme()
finally:
    # Close the report files so buffered output reaches the disk.
    otch.close()
    otch2.close()


</nowiki>
 
Static Wikipedia 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2006 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu