New Additions/Updates:
boundless - educate - edutalab - empatico - es-ebooks - es16 - fr16 - fsfiles - hesperian - solidaria - wikipediaforschools
- wikipediaforschoolses - wikipediaforschoolsfr - wikipediaforschoolspt - worldmap -

See also: Liber Liber - Libro Parlato - Liber Musica  - Manuzio -  Liber Liber ISO Files - Alphabetical Order - Multivolume ZIP Complete Archive - PDF Files - OGG Music Files -

PROJECT GUTENBERG HTML: Volume I - Volume II - Volume III - Volume IV - Volume V - Volume VI - Volume VII - Volume VIII - Volume IX

Ascolta "Volevo solo fare un audiolibro" su Spreaker.
CLASSICISTRANIERI HOME PAGE - YOUTUBE CHANNEL
Privacy Policy Cookie Policy Terms and Conditions
Участник:Maksim-e/zxml stat.py — Википедия

Участник:Maksim-e/zxml stat.py

Материал из Википедии — свободной энциклопедии

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys, math
import codecs
import xmlreader
from zbib_tekst import *

def vivod(b):
    """Echo *b* via ``wikipedia.output`` and append it to the ``otch`` stream.

    ``otch`` is a module-level file object; it is flushed after every
    write so the report on disk stays current while the script runs.
    """
    wikipedia.output(b)
    report = otch
    report.write(b)
    report.flush()

def vivod2(b):
    """Echo *b* via ``wikipedia.output`` and append it to the ``otch2`` stream.

    Same as ``vivod`` but targets the second module-level file object,
    ``otch2``; the file is flushed after every write.
    """
    wikipedia.output(b)
    report = otch2
    report.write(b)
    report.flush()

def ubrkoment(text):
    """Return *text* with nowiki sections and HTML comments removed.

    Both ``<nowiki>...</nowiki>`` blocks (tag matched case-insensitively,
    content may span lines) and ``<!-- ... -->`` comments are cut out
    together with their content.  An opening tag without its closing
    counterpart is left untouched.
    """
    # Ignore text within nowiki tags and HTML comments.
    # The closing </nowiki> must be part of the pattern: with a bare
    # non-greedy '<nowiki>.*?' the '.*?' matches the empty string and only
    # the opening tag itself would be stripped.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->',
                                      re.IGNORECASE | re.DOTALL)
    # Remove one match at a time and rescan from the start, so a match that
    # only comes into existence after a removal is handled as well.
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text


def tkateg(t):
    """Collect the category links found in wikitext *t*.

    Returns a tuple ``(titles, total)``: ``titles`` lists the titles of
    those category links whose title contains no ASCII digit, ``total`` is
    the number of all category links reported by
    ``wikipedia.getCategoryLinks`` for the module-level ``mysite``.
    """
    links = wikipedia.getCategoryLinks(t, mysite)
    digits = u'0123456789'
    titles = [link.title() for link in links]
    # Keep only titles without any digit in them.
    digit_free = [title for title in titles
                  if not any(digit in title for digit in digits)]
    return (digit_free, len(links))

def n_ivi(t):
    """Count the interwiki links in wikitext *t* that point to another site.

    Links whose site equals the module-level ``mysite`` are not counted.
    """
    # A placeholder page (title u'a') only serves as a carrier for the
    # text: its contents are set directly and then queried via interwiki().
    page = wikipedia.Page(mysite, u'a')
    page._contents = t
    return sum(1 for link in page.interwiki() if link.site() != mysite)


def tipsta(n,t):
    """Classify a page by its title *n* and text *t*.

    Returns one of these codes:
      u'ali' -- the text matches the site's redirect regex at its start
      u'kat' -- the title starts with the category namespace prefix
      u'akc' -- any other title containing a colon
      u'apa' -- the text contains the {{apartigilo}} template marker
      u'art' -- everything else
    """
    # match() rather than search(): the redirect pattern has to apply at
    # the very beginning of the text.
    if mysite.redirectRegex().match(t):
        return u'ali'

    katprefi = mysite.family.category_namespace(mysite.lang)
    if u':' in n:
        if n.startswith(katprefi + u':'):
            return u'kat'
        return u'akc'

    # Only these two capitalisations of the template name are recognised.
    for marker in (u'{{apartigilo}}', u'{{Apartigilo}}'):
        if marker in t:
            return u'apa'
    return u'art'

class Stat:
    """Counters for pages with/without category links and interwiki links.

    Attributes:
      o  -- number of pages recorded
      nk -- pages without categories;  pk -- pages with categories
      ni -- pages without interwiki;   pi -- pages with interwiki
      t  -- cross table indexed by 2*has_categories + has_interwiki
            (order: nkni, nkpi, pkni, pkpi)
    """

    def __init__(self):
        self.o = 0
        self.nk = 0
        self.pk = 0
        self.ni = 0
        self.pi = 0
        self.t = [0, 0, 0, 0]

    def dobav(self, kat, ivi):
        """Record one page; *kat* and *ivi* count as present when non-zero."""
        has_kat = int(kat != 0)
        has_ivi = int(ivi != 0)
        self.o += 1
        self.pk += has_kat
        self.nk += 1 - has_kat
        self.pi += has_ivi
        self.ni += 1 - has_ivi
        self.t[2 * has_kat + has_ivi] += 1

    def pech(self):
        """Return the counters as text, one ``label value`` line each."""
        rows = (
            (u'o', self.o),
            (u'nk', self.nk),
            (u'pk', self.pk),
            (u'ni', self.ni),
            (u'pi', self.pi),
            (u'nkni', self.t[0]),
            (u'nkpi', self.t[1]),
            (u'pkni', self.t[2]),
            (u'pkpi', self.t[3]),
        )
        # Label left-aligned in 4 columns, one space, value right-aligned in 7.
        return u''.join([u'%-4s %7d\n' % row for row in rows])

def main(jaz,fvh):
    """Walk an XML dump and report category/interwiki statistics per page type.

    Every page of the dump is classified with ``tipsta``.  For each page
    type a ``Stat`` object records whether the page has category links
    (``tkateg``) and whether it has interwiki links to other sites
    (``n_ivi``).  Redirect pages (type u'ali') that carry categories or
    interwiki links are additionally written, title and full text, through
    ``vivod2``.  The final totals are written through ``vivod``.

    jaz -- language code; not used inside this function, kept so existing
           callers keep working
    fvh -- name of the XML dump, handed to ``xmlreader.XmlDump``
    """
    tipy = [u'akc', u'ali', u'apa', u'art', u'kat']

    # open xml dump and read the pages out of it
    dump = xmlreader.XmlDump(fvh)
    readPagesCount = 0

    # Upper bound on the number of pages read before the scan stops.
    maxpages = 300000000

    # Number of pages actually processed.
    nob = 0

    stat = dict([(tip, Stat()) for tip in tipy])

    for entry in dump.parse():
        readPagesCount += 1
        # print a status message every 1000 pages; the parenthesised
        # single-argument form prints the same on Python 2 and Python 3
        if readPagesCount % 1000 == 0:
            print('%i pages read...' % readPagesCount)
        if readPagesCount > maxpages:
            break

        n = entry.title
        t = entry.text
        tip = tipsta(n, t)

        nob += 1

        # Only the total number of category links matters here; the list
        # of digit-free titles returned alongside it is not needed.
        nkat = tkateg(t)[1]
        nivi = n_ivi(t)

        # Redirects carrying categories or interwiki links are logged in full.
        if tip == u'ali' and (nkat != 0 or nivi != 0):
            vivod2(u'%s\n%s\n========================\n' % (n, t))

        stat[tip].dobav(nkat, nivi)

    vivod(u'\n\n\n\n\nnob=%d\n\n\n\n'%nob)

    for tip in tipy:
        vivod(u'   %s\n%s\n' % (tip, stat[tip].pech()))
    



# Script entry: sys.argv[1] is the XML dump to read, sys.argv[2] the base
# name of the report files (a trailing '.txt' is ignored).
jaz = u'eo'
# NOTE(review): argHandler presumably hands back the argument unless it is a
# global bot option -- confirm against the pywikipedia version in use.
fvh = wikipedia.argHandler(sys.argv[1], 'slov_iz_xml')
fvih = wikipedia.argHandler(sys.argv[2], 'slov_iz_xml')
if fvih.endswith(u'.txt'):
    fvih = fvih[:-4]
fotch = fvih + u'.txt'        # main statistics report, written by vivod()
fotch2 = fvih + u'_op.txt'    # dump of selected redirect pages, written by vivod2()

otch = codecs.open(fotch, 'w', 'utf-8')
otch2 = codecs.open(fotch2, 'w', 'utf-8')
mysite = wikipedia.getSite(jaz)

# stopme() has to run whether main() succeeds or raises, and both report
# files are closed explicitly instead of being left to interpreter shutdown.
try:
    main(jaz, fvh)
finally:
    try:
        wikipedia.stopme()
    finally:
        otch.close()
        otch2.close()


</nowiki>
 

Static Wikipedia (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2006 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia February 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu