New Immissions/Updates:
boundless - educate - edutalab - empatico - es-ebooks - es16 - fr16 - fsfiles - hesperian - solidaria - wikipediaforschools
- wikipediaforschoolses - wikipediaforschoolsfr - wikipediaforschoolspt - worldmap -

See also: Liber Liber - Libro Parlato - Liber Musica  - Manuzio -  Liber Liber ISO Files - Alphabetical Order - Multivolume ZIP Complete Archive - PDF Files - OGG Music Files -

PROJECT GUTENBERG HTML: Volume I - Volume II - Volume III - Volume IV - Volume V - Volume VI - Volume VII - Volume VIII - Volume IX

Ascolta ""Volevo solo fare un audiolibro"" su Spreaker.
CLASSICISTRANIERI HOME PAGE - YOUTUBE CHANNEL
Privacy Policy Cookie Policy Terms and Conditions
Участник:Maksim-e/zch xml kat.py — Википедия

Участник:Maksim-e/zch xml kat.py

Материал из Википедии — свободной энциклопедии

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *

def vivod(b):
    wikipedia.output(b)
    otch.write(b)
    otch.flush()

def ubrkoment (text):
    # Ignore tekst within nowiki tags and HTML comments
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?|<!--.*?-->', re.IGNORECASE | re.DOTALL)
#    tr=u''
#    while text!=u'':
#        match = nowikiOrHtmlCommentR.search(text)
#        if not match:
#            tr+=text
#            break
#        tr += text[:match.start()]
#        text = text[match.end():]    
#    return tr
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]    
#        vivod(u' %d  %d\n'%(match.start(),match.end()))
        match = nowikiOrHtmlCommentR.search(text)
    return text

def xml_jaz(fvh):
    f0=codecs.open(fvh,'rb')
    zag=f0.read(1000)
    f0.close()

    jzag='xml:lang="'
    p=iskats(zag,0,jzag)
    if p==-1:
        print '!!! net \'xml:lang=""\''
        return u''
    p+=len(jzag)
    p1=iskats(zag,p,'"')
    if p1==-1:
        print '!!! net \'xml:lang=""\''
        return u''
    return zag[p:p1]

def korr_im (jaz,t,pref):

    for p in pref:
        if t.startswith(p):
            t=t[len(p):]
            break

    t=perv_upper(t)

    if u':' in t:
        if ( (not t.startswith(u'Category:')) and 
                     (not t.startswith(u'Image:')) ):
            katprefi=mysite.family.category_namespace(jaz)
            if not t.startswith(katprefi+u':'):
                izprefi=mysite.family.image_namespace(jaz)
                if not t.startswith(izprefi+u':'):
                    return u''
    return t

iskkat=[
    u'[[Kategorio:Naskiĝ',
#    u'[[kategorio:Naskiĝ',
    u'[[Kategorio:naskiĝ',
#    u'[[kategorio:naskiĝ',
    u'[[Kategorio:Mort',
#    u'[[kategorio:Mort',
    u'[[Kategorio:mort',
#    u'[[kategorio:mort',
       ]

def main(vhjaz,fvh,fvih,fipref):
    tzfl=0
    tzst={}
    tzno={}


    pref=[]
    if 1:
        fpref=codecs.open(fipref,'rb',encoding='utf-8')
        for s in fpref.readlines():
            if ord(s[0]) == 65279:
                s=s[1:]
            s=ubr_nk_prob(s)
            if len(s)>0:
                pref.append(perv_upper(s))
        fpref.close()

    spk=[]
    spr=[]
    for k in pref:
        k=perv_upper(k)
        i=iskat(k,u':')
        if i==-1:
            vivod(u'!!! %s\n'%k)
            continue
        k0=k[:i+1]+perv_upper(k[i+1:])
        k1=u'Category'+k0[i:]
        k2=k[:i+1]+perv_lower(k[i+1:])
        k3=u'Category'+k2[i:]
        spk.append([k0,k1,k2,k3])
        spr.append([])

    f1=codecs.open(fvih, 'w', 'utf-8')

    insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')
    katprefi=insite.family.category_namespace(vhjaz)
    kp0=katprefi[0]

#    tblredir = {}
    # open xml dump and read page titles out of it
    dump = xmlreader.XmlDump(fvh)
    redirR = insite.redirectRegex()
    readPagesCount = 0

    for entry in dump.parse():
        readPagesCount += 1
#        if readPagesCount>=5000:
#            break
        # always print status message after 1000 pages
        if readPagesCount % 1000 == 0:
            print '%i pages read...' % readPagesCount
        m = redirR.search(entry.text)
        if (not m): # and (not u':' in entry.title):
            t=entry.text
            t=ubrkoment(t)
            t=t.replace(u'[[ ',u'[[').replace(u': ',u':')
            t=t.replace(u'_',u' ')
            t=t.replace(u'[[c',u'[[C')
            t=t.replace(u'[['+kp0.lower(),u'[['+kp0.upper())
            i=0
            while i<len(spk):
                dop=0
                for isk in spk[i]:
                    if isk in t:
                        dop=1
                        break
                if dop:
                    spr[i].append(entry.title)
                i+=1

    i=0
    while i<len(spk):
        b=u'# %s\n' % spk[i][0]
        f1.write(b)
        f1.flush()
        for isk in spr[i]:
            b=u'%s\n' % isk
            f1.write(b)
            f1.flush()
        i+=1


fvh = sys.argv[1]
fpref = sys.argv[2]
f=fpref
if f.endswith(u'.txt'):
    f=f[:len(f)-4]
fvih = f+u'_re.txt'
fotch = f+u'_ot.txt'

vhjaz=xml_jaz(fvh)
if vhjaz!=u'':
    otch = codecs.open(fotch, 'w', 'utf-8')
    main(vhjaz,fvh,fvih,fpref)

</nowiki>
 

Static Wikipedia (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2006 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia February 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu