New Additions/Updates:
boundless - educate - edutalab - empatico - es-ebooks - es16 - fr16 - fsfiles - hesperian - solidaria - wikipediaforschools
- wikipediaforschoolses - wikipediaforschoolsfr - wikipediaforschoolspt - worldmap -

See also: Liber Liber - Libro Parlato - Liber Musica  - Manuzio -  Liber Liber ISO Files - Alphabetical Order - Multivolume ZIP Complete Archive - PDF Files - OGG Music Files -

PROJECT GUTENBERG HTML: Volume I - Volume II - Volume III - Volume IV - Volume V - Volume VI - Volume VII - Volume VIII - Volume IX

Ascolta "Volevo solo fare un audiolibro" su Spreaker.
CLASSICISTRANIERI HOME PAGE - YOUTUBE CHANNEL
Privacy Policy Cookie Policy Terms and Conditions
Участник:Maksim-e/zslov iz xml.py — Википедия

Участник:Maksim-e/zslov iz xml.py

Материал из Википедии — свободной энциклопедии

# -*- coding: UTF-8 -*-

__version__='$Id:'

import wikipedia
import re, sys
import codecs
import xmlreader
from zbib_tekst import *


def ubrkoment (text):
    """Return ``text`` with nowiki sections and HTML comments removed.

    Everything from ``<nowiki>`` up to the nearest ``</nowiki>`` and
    everything from ``<!--`` up to the nearest ``-->`` is cut out, tags
    included.  Matching is case-insensitive and spans line breaks.  An
    opening tag without its closing counterpart is left untouched.
    """
    # The nowiki alternative must include the closing tag; without it the
    # lazy ``.*?`` matches nothing and only the opening tag itself would be
    # removed, leaving the nowiki content in place.
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->', re.IGNORECASE | re.DOTALL)
    # Remove one match at a time and rescan from the start, so that a match
    # formed by joining the text around a removed span is also removed.
    match = nowikiOrHtmlCommentR.search(text)
    while match:
        text = text[:match.start()] + text[match.end():]
        match = nowikiOrHtmlCommentR.search(text)
    return text

def korr_im (mysite,jaz,t,pref,ftkat):
    """Normalise a page title and determine its namespace number.

    Parameters:
        mysite -- site object; only ``mysite.family`` is used, to look up
                  the localised namespace names for ``jaz``.
        jaz    -- language code passed to the family namespace lookups.
        t      -- raw page title.
        pref   -- sequence of prefixes; the first one the title starts
                  with is stripped (at most one prefix is removed).
        ftkat  -- truthy: accept category titles only;
                  falsy: accept titles without a colon, plus category,
                  image and template titles.

    Returns a tuple ``(title, ns)``.  ``ns`` is 14 for categories, 6 for
    images, 10 for templates and 0 for titles without a colon.  A
    rejected title yields ``(u'', 0)``.  When ``ftkat`` is falsy, the
    generic English prefixes ``Category:``, ``Image:`` and ``Template:``
    are replaced by the localised namespace name; in category-only mode
    the title is returned unchanged.
    """
    def normalise(title):
        # Helpers come from zbib_tekst; presumably they trim whitespace
        # and upper-case the first letter -- TODO confirm.
        return perv_upper(ubr_nk_prob(title))

    t=normalise(t)

    # Strip the first matching prefix only, then normalise the remainder.
    for prefix in pref:
        if t.startswith(prefix):
            t=normalise(t[len(prefix):])
            break

    if ftkat:
        # Category-only mode: the title is kept as it is.
        if t.startswith(u'Category:'):
            return (t,14)
        local_cat=mysite.family.category_namespace(jaz)
        if t.startswith(local_cat+u':'):
            return (t,14)
        return (u'',0)

    if u':' not in t:
        # No namespace separator at all.
        return (t,0)

    # (localised name, generic English prefix, namespace number), tested
    # in this order; for each row the localised form is tried first.
    family=mysite.family
    known=(
        (family.category_namespace(jaz),u'Category:',14),
        (family.image_namespace(jaz),u'Image:',6),
        (family.template_namespace(jaz),u'Template:',10),
    )
    for local_name,generic,number in known:
        if t.startswith(local_name+u':'):
            return (t,number)
        if t.startswith(generic):
            # Swap the generic prefix for the localised one.
            return (local_name+u':'+t[len(generic):],number)

    # A colon is present but the namespace is none of the accepted ones.
    return (u'',0)


def main(fvh,fvih,fipref,ftkat):
    """Extract interwiki link groups from an XML dump into a text file.

    Parameters:
        fvh    -- path of the XML dump to read.
        fvih   -- path of the UTF-8 output file (overwritten).
        fipref -- path of a UTF-8 file with one title prefix per line, or
                  u'' for no prefixes; the list is handed to korr_im for
                  titles of the dump's own pages.
        ftkat  -- flag passed through to korr_im (category-only mode).

    The dump language is taken from the ``xml:lang="..."`` attribute found
    in the first 1000 bytes of the dump; if it is missing, a message is
    printed and nothing is written.

    For every non-redirect page whose title korr_im accepts and which has
    at least one accepted interwiki link in the same namespace, the output
    receives a line ``<dump language>:<title>``, one ``<lang>:<title>``
    line per link, and a terminating empty line.
    """
    # Read the optional prefix list.
    pref=[]
    if fipref!=u'':
        fpref=codecs.open(fipref,'rb',encoding='utf-8')
        try:
            for s in fpref.readlines():
                # Drop a leading byte order mark (U+FEFF).
                if s.startswith(u'\ufeff'):
                    s=s[1:]
                s=s.replace(u'\r',u'')
                if s.endswith(u'\n'):
                    s=s[:-1]
                pref.append(s)
        finally:
            fpref.close()

    # Peek at the dump header to find its language code.
    f0=codecs.open(fvh,'rb')
    try:
        zag=f0.read(1000)
    finally:
        f0.close()

    jzag='xml:lang="'
    # iskats comes from zbib_tekst; the -1 checks below treat its result
    # as a find-style index -- presumably "search from position"; confirm.
    p=iskats(zag,0,jzag)
    if p==-1:
        print('!!! net \'xml:lang=""\'')
        return
    p+=len(jzag)
    p1=iskats(zag,p,'"')
    if p1==-1:
        print('!!! net \'xml:lang=""\'')
        return
    vhjaz=zag[p:p1]

    # Compiled once here instead of once per page.
    interwikiR = re.compile(r'\[\[([a-z\-]+)\s?:([^\[\]\n]*)\]\]')

    f1=codecs.open(fvih, 'w', 'utf-8')
    try:
        insite=wikipedia.getSite(vhjaz,fam = u'wikipedia')

        # open xml dump and read page titles out of it
        dump = xmlreader.XmlDump(fvh)
        redirR = insite.redirectRegex()
        readPagesCount = 0

        for entry in dump.parse():
            readPagesCount += 1
            # always print status message after 10000 pages
            if readPagesCount % 10000 == 0:
                print('%i pages read...' % readPagesCount)

            # Redirect pages are skipped.
            if redirR.match(entry.text):
                continue

            (b0,ns0)=korr_im(insite,vhjaz,entry.title,pref,ftkat)
            if b0==u'':
                continue

            te=ubrkoment(entry.text)
            fperv=1    # 1 until the page's own title line has been written
            for lang, pagetitle in interwikiR.findall(te):
                # Check if it really is in fact an interwiki link to a known
                # language, or if it's e.g. a category tag or an internal link
                if lang not in insite.family.langs:
                    continue
                if '|' in pagetitle:
                    # ignore text after the pipe
                    pagetitle = pagetitle[:pagetitle.index('|')]
                # korr_im takes the site as its first argument; namespace
                # names are looked up for the link's own language.
                (b1,ns1)=korr_im(insite,lang,pagetitle,[],ftkat)
                if b1==u'' or ns0!=ns1:
                    continue
                if fperv:
                    f1.write(u'%s:%s\n' % (vhjaz,b0))
                    fperv=0
                f1.write(u'%s:%s\n' % (lang, b1))
            if fperv==0:
                # An empty line terminates the group.
                f1.write(u'\n')
                f1.flush()
    finally:
        f1.close()

# Command line: <xml dump> <output file> [<prefix file> | k]
# A third argument of "k" selects category-only mode instead of naming a
# prefix file.
fvh, fvih = sys.argv[1], sys.argv[2]

fpref=u''
ftkat=0
if len(sys.argv)>=4:
    if sys.argv[3]==u'k':
        ftkat=1
    else:
        fpref=sys.argv[3]

# wikipedia.stopme() is called whether main() succeeds or raises; any
# exception still propagates afterwards.
try:
    main(fvh,fvih,fpref,ftkat)
finally:
    wikipedia.stopme()


</nowiki>
 

Static Wikipedia (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2006 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia February 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu