New Immissions/Updates:
boundless - educate - edutalab - empatico - es-ebooks - es16 - fr16 - fsfiles - hesperian - solidaria - wikipediaforschools
- wikipediaforschoolses - wikipediaforschoolsfr - wikipediaforschoolspt - worldmap -

See also: Liber Liber - Libro Parlato - Liber Musica  - Manuzio -  Liber Liber ISO Files - Alphabetical Order - Multivolume ZIP Complete Archive - PDF Files - OGG Music Files -

PROJECT GUTENBERG HTML: Volume I - Volume II - Volume III - Volume IV - Volume V - Volume VI - Volume VII - Volume VIII - Volume IX

Ascolta "Volevo solo fare un audiolibro" su Spreaker.
CLASSICISTRANIERI HOME PAGE - YOUTUBE CHANNEL
Privacy Policy Cookie Policy Terms and Conditions
User:PDFbot/pdfbot.py - Wikipedia, the free encyclopedia

User:PDFbot/pdfbot.py

From Wikipedia, the free encyclopedia

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This script can be used to update links transcluded using the {{PDFlink}} template.

Syntax: python pdfbot.py [-ref: TemplateName]

Command line options:

-file:       Update article pages listed in a text file.
-ref:        Update article pages transcluding from a given page.
-cat:        Update article pages from the given category.

"""

import re, sys, httplib, time
import wikipedia, pagegenerators, login, config, catlib
from urllib2 import urlparse

# Define global variables
# writeDelay: pause after a page has been saved (see process_article)
writeDelay = 60 # seconds
# readDelay: pause after a page that did not need saving (see process_article)
readDelay  = 15 # seconds
# httpDebug: value handed to the HTTP connection's set_debuglevel() in whichURL
httpDebug  = 0
# userAgent: sent as the User-Agent header of every HEAD request in whichURL
userAgent  = 'PDFbot (http://en.wikipedia.org/wiki/User:PDFbot)'

def whichURL(location):
        """
        Follow redirects for a URL using HEAD requests and describe the result.

        At most six requests are made.  Each 'Location' header is resolved
        against the URL that produced it, so relative redirects work too.

        Parameters:
            location -- absolute http:// or https:// URL (or None)

        Returns a list [site, path, content_length, content_type] for the last
        URL requested.  The two header values are strings, or None when the
        server did not send them.  On any error, or when location is None,
        [None, None, None, None] is returned and a message is printed.
        """
        if location is None:
                wikipedia.output(u'Error with URL')
                return ( [None, None, None, None] )

        redirectCounter = 6
        conn = None
        try:
                while True:
                        (scheme, site, path, args, query, frag) = urlparse.urlparse(location)

                        # Rebuild the request target.  urlparse strips the ';' and '?'
                        # separators, so they have to be put back, and an empty
                        # path must be requested as '/'.
                        if not path:
                                path = '/'
                        if args:
                                path = path + ';' + args
                        if query:
                                path = path + '?' + query

                        if scheme.lower() == 'https':
                                conn = httplib.HTTPSConnection(site)
                        else:
                                conn = httplib.HTTPConnection(site)
                        conn.set_debuglevel(httpDebug)
                        conn.putrequest('HEAD', path)
                        conn.putheader('User-Agent', userAgent)
                        conn.endheaders()

                        response = conn.getresponse()
                        redirect = response.msg.getheader('location')

                        redirectCounter -= 1
                        if (redirect is None or redirectCounter <= 0):
                                # Final answer, or the redirect budget is used up
                                break

                        conn.close()
                        location = urlparse.urljoin(location, redirect)
                        wikipedia.output( u'Redirecting to %s' % location )

                content_length = response.msg.getheader('content-length')
                content_type   = response.msg.getheader('content-type')
                return ( [site, path, content_length, content_type] )
        except Exception:
                # Best effort: any network or parsing problem means "unknown".
                # KeyboardInterrupt is deliberately not caught so the bot can be stopped.
                wikipedia.output(u'Error with URL')
                return ( [None, None, None, None] )
        finally:
                # Release the socket on every exit path
                if conn is not None:
                        conn.close()

def binary_notation(size):
        """
        Convert a byte count to a short human readable value in wiki markup.

        Parameters:
            size -- number of bytes, as a number or a numeric string

        Returns a string such as '120 [[Kibibyte|KiB]]'.  The value is divided
        by 1024 until it is below 1000 and then truncated (not rounded) to at
        most four characters.  Values too large for the biggest unit are given
        as a whole number of that unit.
        """
        units = ['bytes', '[[Kibibyte|KiB]]', '[[Mebibyte|MiB]]', '[[Gibibyte|GiB]]']

        a = float(size)
        unit = 0
        # Stop at the last unit instead of running off the end of the list
        while (a >= 1000. and unit < len(units) - 1):
                a /= 1024.
                unit += 1

        if (a >= 1000.):
                # Four characters would cut digits off the integer part here
                byteSigs = '%d' % a
        else:
                # Truncate and remove trailing dot
                byteSigs = str(a)[:4]
                if (byteSigs.endswith('.')):
                        byteSigs = byteSigs[:3]
        return ( byteSigs + ' ' + units[unit] )

def update_size_paramter(template_text):
        """
        Return a {{PDFlink}} template call with a refreshed file size parameter.

        (The misspelt function name is kept because callers use it.)

        Parameters:
            template_text -- the complete template call, e.g. '{{PDFlink|[http://... Title]}}'

        The first URL in the template is requested with whichURL().  If the
        server reports more than 16 bytes and a PDF or octet-stream content
        type, the result is the template up to its first parameter, followed
        by the size from binary_notation() and an HTML comment holding the raw
        content type and byte count.  In every other case template_text is
        returned unchanged.
        """
        url_match    = re.search(r'(http[^] |}]*)', template_text)
        prefix_match = re.search(r'(\{\{[^|]*\|[^|}]*)[^}]*\}\}', template_text)
        if (url_match is None or prefix_match is None):
                # Not a template call with a link in it; nothing to update
                return template_text
        location    = url_match.group(1)
        prefix_text = prefix_match.group(1)

        # A template call containing '=' needs its parameters numbered explicitly
        if ('=' in template_text):
                parameter_prefix = '|2='
        else:
                parameter_prefix = '|'

        # Fix indirect HTML character references
        location = location.replace('&#61;', '=')
        location = location.replace('&amp;', '&')

        (site, path, content_length, content_type) = whichURL(location)
        try:
                size = int(content_length)
        except (TypeError, ValueError):
                # Header missing or not a number
                return template_text

        if (size > 16):
                # I should really put in 404 error handling code, but this has been working just fine.
                if (content_type is not None and re.search(r'pdf|octet-stream', content_type)):
                        return prefix_text + parameter_prefix + binary_notation(content_length) + '<!-- ' + content_type + ', ' + str(content_length) +" bytes -->}}"
                else:
                        wikipedia.output(u'Unusual content_type: %s' % content_type)
        # If anything else return the template_text back
        return template_text

def process_article(page):
        """
        Normalise PDF link markup on one page and refresh the listed file sizes.

        Parameters:
            page -- page object providing get() and put(text)

        The page text is rewritten in three stages: template casing is
        unified, hard coded "(PDF)" notes and old style {{PDF}} usage are
        converted to {{PDFlink|...}} or to format=PDF inside citation
        templates, and finally every {{PDF...|...}} call containing a link is
        passed to update_size_paramter().  The page is saved only if something
        changed after the casing step.  The function then sleeps for
        writeDelay seconds after a save and readDelay seconds otherwise.
        """
        wikitext = page.get()

        # Fix Casing (Reduces the number of possible expressions)
        wikitext = re.sub(r'\{\{ *(Template:|template:|)(PDF|Pdf|pdf)', r'{{PDF', wikitext)

        # State point.  Count any changes as needing an update if they're after this line
        state0 = wikitext

        # Convert hard coded pdf links  (ex: [http link] (pdf) )
        wikitext = re.sub(r'(\[http[^]]*\]) *\((\[\[[^|\]]*|)(PDF|pdf)(\]\]|)\)', r'{{PDFlink|\1}}', wikitext)

        # Convert from the old style to the new style (ex: [http link] {{PDF}} or {{PDF}} [http link] )
        wikitext = re.sub(r'(\(|)\{\{(PDFlink|PDF)\}\}(\)|) *(\[http[^]]*\])', r'{{\2|\4}}', wikitext)
        wikitext = re.sub(r'("|)(\[http[^]]*\])("|)([^a-zA-Z(]*) *(\(|)\{\{(PDFlink|PDF)\}\}(\)|)', r'{{\6|\2}}\4', wikitext)

        # Remove PDFlink from citation templates.
        # The optional trailing group matches a closing parenthesis so that
        # "format = ({{PDF}})" does not leave a stray ")" behind.
        wikitext = re.sub(r'(format *= *)(\(|)\{\{PDF(|link)\}\}(\)|)', r'\1PDF', wikitext)
        wikitext = re.sub(r'(\{\{(Cite|cite)[^}]*)(}}[^a-zA-Z]*)(\(|)\{\{(PDF|PDFlink)\}\}(\)|)', r'\1 |format=PDF\3', wikitext)
        wikitext = re.sub(r'(\{\{.ite web[^}]*) *(\(|)\{\{(PDF|PDFlink)\}\}(\)|) *([^}]*\}\})', r'\1 |format=PDF \5', wikitext)

        # Fix equal sign problem
        wikitext = re.sub(r'(PDF|PDFlink)\|(1=|)([^{|}]*=[^{|}]*)', r'\1|1=\3', wikitext)

        # Second state point: anything changing from here on is a file size update
        state1 = wikitext
        m = re.findall(r'\{\{PDF[^|}]*\|[^}]*\}\}', wikitext)

        seen = set()
        for s in m:
                # Every occurrence of s is replaced at once, so repeats need no second request
                if (s in seen):
                        continue
                seen.add(s)
                if ('http' in s):
                        replacetext = update_size_paramter(s)
                        # Uncomment the line below to see the replacement text
#                       wikipedia.output(replacetext)
                        # Plain string replacement: re.sub would treat backslashes in
                        # the new text as group references or escapes.
                        wikitext    = wikitext.replace(s, replacetext)

        if (wikitext == state1):
                EditMsg = 'Corrected use of {{[[Template:PDFlink|PDFlink]]}}'
        else:
                EditMsg = 'Updating filesize for external links tagged with {{[[Template:PDFlink|PDFlink]]}}'
        wikipedia.setAction(EditMsg)

        # If the text has changed at all since the state point, upload it
        if (wikitext != state0):
                wikipedia.output(u'Page change by %s bytes.  Writing new version.' % str(len(wikitext)-len(state0)))
                page.put(wikitext)

                # Pause to reduce load on the servers
                time.sleep(writeDelay)
        else:
                time.sleep(readDelay)
        
def main():
        """
        Build a page generator from the command line and process every article.

        Recognised arguments (as returned by wikipedia.handleArgs()):
            -ref:PAGE    pages referring to PAGE
            -file:NAME   pages listed in the text file NAME
            -cat:NAME    pages in the category NAME

        Any other argument, or no page source at all, prints the help text
        and returns without processing anything.  Pages whose title contains a
        non-article namespace prefix (User:, Talk:, Template talk:, ...) are
        skipped.
        """
        site = wikipedia.getSite()
        gen  = None

        for arg in wikipedia.handleArgs():
                if (arg.startswith('-ref:')):
                        referredPage = wikipedia.Page(site, arg[5:])
                        gen = pagegenerators.ReferringPageGenerator(referredPage)
                elif (arg.startswith('-file:')):
                        gen = pagegenerators.TextfilePageGenerator(arg[6:])
                elif (arg.startswith('-cat:')):
                        cat = catlib.Category(site, arg[5:])
                        gen = pagegenerators.CategorizedPageGenerator(cat)
                else:
                        wikipedia.showHelp(u'pdfbot')
                        return

        if (gen is None):
                # No page source was given; iterating below would fail
                wikipedia.showHelp(u'pdfbot')
                return

        wikipedia.output(u'Read delay is %s seconds.' % readDelay)
        wikipedia.output(u'Write delay is %s seconds.\n' % writeDelay)

        for page in gen:
                if (not re.findall(r'(User|Wikipedia|Image|MediaWiki|Template|Help|Category|Portal|Talk)(| talk):', page.title())):
                        process_article(page)

# Run the bot only when this file is executed as a script
if __name__ == "__main__":
        try:
                main()
        finally:
                # Always call wikipedia.stopme(), even if main() raised
                wikipedia.stopme()

Static Wikipedia (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2006 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia February 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu