New Immissions/Updates:
boundless - educate - edutalab - empatico - es-ebooks - es16 - fr16 - fsfiles - hesperian - solidaria - wikipediaforschools
- wikipediaforschoolses - wikipediaforschoolsfr - wikipediaforschoolspt - worldmap -

See also: Liber Liber - Libro Parlato - Liber Musica  - Manuzio -  Liber Liber ISO Files - Alphabetical Order - Multivolume ZIP Complete Archive - PDF Files - OGG Music Files -

PROJECT GUTENBERG HTML: Volume I - Volume II - Volume III - Volume IV - Volume V - Volume VI - Volume VII - Volume VIII - Volume IX

Ascolta ""Volevo solo fare un audiolibro"" su Spreaker.
CLASSICISTRANIERI HOME PAGE - YOUTUBE CHANNEL
Privacy Policy Cookie Policy Terms and Conditions
User:Wherebot/Source - Wikipedia, the free encyclopedia

User:Wherebot/Source

From Wikipedia, the free encyclopedia

Here is the latest code as of 3/5/2007. There are no major problems with it as of the time of writing.

Here is the source code. This has only been tested on UNIX-like systems, but it should theoretically also work on Windows. Note that the code was not intended for wide distribution, so it is not well-commented. Sorry! Also note that the code requires wget, pywikipediabot, Yahoo's Python search plugin, Perl, and the Bot::BasicBot and IPC::Open2 Perl modules. You may use the code under the GNU General Public License.

If you want to modify Wherebot to run on a different wiki or language, there are some modifications that need to be made. I have marked where people may want to do so on lines containing the text "#CONFIG."

Please go into edit mode to see the source of the program with proper linebreaks.

Here is the main file, cv-watch.pl. Place it where you wish:

 #!/usr/bin/perl
 use strict;
 
 #Some of the IRC parts of this bot are based on the Bot::BasicBot sample code.
 
 #Connection settings, handed to the constructor in exactly this order.
 my @botSettings = (
    channels => ["#en.wikipedia", "#en.wikiversity"],
    nick     => "Wherebot4", #CONFIG: change Wherebot4 to something unique
    server   => "irc.wikimedia.org",
 );
 Wherebot->new(@botSettings)->run();
 
 package Wherebot;
 use base qw/Bot::BasicBot/;
 use IPC::Open2;
 
 #Bot::BasicBot callback: inspects one line said in a joined channel and hands
 #newly created pages to act().
 #
 #Parameters:
 #   (first)  - the bot object; not used
 #   $message - hash reference; the "body" and "channel" entries are read
 #
 #The channel name without its "#" is used as the wiki host name (".org" is
 #appended), and the page URL is taken from the text that follows "02" in the
 #line (presumably the tail of an IRC colour code -- TODO confirm).
 #
 #Always returns nothing.
 #NOTE(review): Bot::BasicBot is understood to say any defined return value
 #back to the channel, hence the explicit bare returns -- confirm.
 sub said {
    my (undef, $message) = @_;
    my $rawMessage = $message->{"body"};
    my $channel = $message->{"channel"};
    return unless defined $rawMessage && defined $channel;
 
    (my $site = $channel) =~ s/#//;
    #\Q..\E keeps the dots of the host name literal
    my ($url) = $rawMessage =~ m#02(http://\Q$site\E\.org[^ ]+)#;
    return unless defined $url; #no page URL in this line, nothing to check
 
    #CONFIG: the next four lines are to ignore certain pages. Customize if you like
    return if $url =~ /[Tt]alk:/;
    return if $url =~ /Sandbox/;
    return if $url =~ /Articles[_ ]for[_ ]deletion/; #URLs cannot contain spaces, so accept "_" as well
    return if $url =~ /Wikipedia:Introduction/;
 
    chop $rawMessage; #drop the final character of the line, as the original code did
    return unless $rawMessage =~ /N\x{03}10/; #only lines carrying the "N" flag are checked
 
    #CONFIG: the next seven lines are to ignore certain namespaces. Customize if you like.
    return if $url =~ /User:/;
    return if $url =~ /Wikipedia:/;
    return if $url =~ /Portal:/;
    return if $url =~ /Help:/;
    return if $url =~ /Template:/;
    return if $url =~ /Category:/;
    return if $url =~ /Image:/;
 
    act($channel, $url);
    return; #never pass act()'s result on as the callback's return value
 }
 
 #Checks one newly created page for copied text and reports a suspected match.
 #
 #Parameters:
 #   $channel - IRC channel the announcement came from (e.g. "#en.wikipedia")
 #   $url     - page URL as captured by said(); its final character is dropped
 #              (presumably a trailing IRC control character -- TODO confirm)
 #
 #What it does:
 #   1. fetches the first line of the page's raw wikitext with wget;
 #   2. strips wiki markup and gives up on redirects, templates, HTML and
 #      anything shorter than 75 characters;
 #   3. runs search.py on the quoted text, retrying twice after a search error;
 #   4. takes the fourth line of the search output as the URL of the top hit
 #      and, unless that URL is whitelisted, runs append.py with a report line.
 #
 #Returns nothing. Dies only if open2 cannot start the search command.
 sub act {
    my ($channel, $url) = @_;
    my $misc = "/home/where/misc";
    return unless defined $channel && defined $url;
 
    chop $url;
    my ($page) = $url =~ m#/wiki/(.*)#;
    return unless defined $page && length $page; #not a /wiki/ URL, no title to report
 
    #wget is started directly (no shell), so nothing in the URL needs quoting.
    #Only the first line of the page is read.
    my $wget;
    if (!open($wget, "-|", "wget", "$url?action=raw", "-q", "-O", "-")) {
       warn "Could not run wget: $!\n";
       return;
    }
    my $term = <$wget>;
    close $wget; #closing early makes wget stop; its exit status is not needed
    return unless defined $term;
    chomp $term;
 
    $term = trim($term); #get it to <100 words so yahoo doesn't go crazy
    return if $term =~ /#redirect/i;
    return if $term =~ /^\{/;
    return if $term =~ /^</;
 
    $term =~ s#'''##g;
    $term =~ s#''##g;
    $term =~ s#\[\[##g;
    $term =~ s#\]\]##g;
    #Yahoo chokes on quotes (and parentheses); yes, this will probably return
    #false matches, but it is better than the alternative
    $term =~ tr/*"()//d;
 
    return if length($term) < 75;
 
    #All of the search output is read before waiting for the child, so a long
    #result list cannot fill the pipe and stall both processes.
    my @results;
    my $failures = 0;
    while (1) {
       my $pid = open2(my $reader, my $writer, "python", "$misc/search.py", "-t", "web", '"' . $term . '"'); #CONFIG: change $misc/search.py to the path to search.py from the Yahoo search API
       @results = <$reader>;
       close $reader;
       close $writer;
       waitpid $pid, 0;
 
       my $failed = !@results
          || $results[0] =~ /Internal WebService error, temporarily unavailable/
          || $results[0] =~ /^Got an error/;
       last unless $failed;
 
       ++$failures;
       if ($failures >= 3) {
          #Do not go on: the error text must not be mistaken for a search hit
          warn "Search failed $failures times; giving up\n";
          return;
       }
       warn "Search failed; retrying\n";
       sleep 60;
    }
 
    return if $results[0] =~ /^No results\s*/;
 
    my $from = $results[3]; #first line, two skipped lines, then the hit's URL
    return unless defined $from;
    $from =~ s#\s##g;
    return unless length $from;
    if ($from =~ m#^http://en\.wikipedia\.org# || $from =~ m#\.gov# || $from =~ m#^http://en\.wikibooks#) {
       return;
    }
 
    #Get the page in the proper format: decode %XX escapes in a single pass (so
    #the result of decoding "%25" is never decoded again), then turn
    #underscores into spaces
    $page =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;
    $page =~ tr/_/ /;
 
    (my $strippedUrl = $from) =~ s#^http://##;
 
    if ($channel eq "#en.wikipedia") { #CONFIG: change this line according to your language and version
       chdir "$misc/pywikipedia" #CONFIG: change this line according to where your pywikipedia directory is
          or warn "Could not chdir to $misc/pywikipedia: $!\n";
    }
    print "Writing\n";
    my $status = system "nice", "-n", "10", "python", "append.py", "* [[$page]] -- [$from $strippedUrl]. Reported at ~~~~~"; #CONFIG: change wording of how Wherebot reports if you like
    warn "append.py did not finish cleanly (status $status)\n" if $status != 0;
    return;
 }
 
 #Cuts a string down to its leading words so the search query stays short.
 #
 #Parameters:
 #   $in       - text to shorten; words are separated by single spaces
 #   $maxWords - optional; largest number of words to keep (default 98, which
 #               keeps the result under 100 words)
 #
 #Returns the kept words joined by single spaces, or "" for empty/undefined
 #input. Text that is already short enough comes back unchanged.
 sub trim { #cut parameter to <100 words
    my ($in, $maxWords) = @_;
    $maxWords = 98 unless defined $maxWords;
    $maxWords = 0 if $maxWords < 0;
    return "" unless defined $in;
 
    my @words = split / /, $in;
    splice @words, $maxWords if @words > $maxWords; #drop everything past the limit
    return join " ", @words;
 }

For reasons unknown to me, after long periods of time, the bot may shut down. I thus recommend running it using persist.pl:

 
 #!/bin/perl
 use strict;
 use warnings;
 
 #Keeps Wherebot running: cv-watch.pl is started again whenever it exits.
 #Run this from the directory that contains cv-watch.pl.
 while (1) {
    my $status = system "perl", "cv-watch.pl";
    if ($status == -1) {
       #perl itself could not be started; retrying forever would not help
       die "Could not start cv-watch.pl: $!\n";
    }
 
    #system() ignores SIGINT and SIGQUIT in this process while the bot runs, so
    #stop here as well when the bot was ended by one of those signals.
    my $signal = $status & 127;
    if ($signal == 2 || $signal == 3) { #SIGINT or SIGQUIT
       exit 0;
    }
 
    #Pause before restarting: a bot that fails immediately is not respawned in
    #a tight loop, and Ctrl-C during the pause stops this script.
    sleep 5;
 }

The following file, append.py, should go in the pywikipediabot directory.

 #!/usr/bin/python
 # Appends one line of wikitext to the report page.
 #
 # Usage: python append.py "LINE OF WIKITEXT"
 #
 # The line is taken from the first command-line argument and added, after a
 # newline, to the end of the page's current text; the page is then saved as a
 # non-minor edit.
 
 import wikipedia
 import sys
 
 # Check the argument before touching the wiki, so a bad call fails at once
 # with a usage message instead of after the page has been fetched.
 if len(sys.argv) < 2:
     sys.stderr.write("usage: append.py LINE_OF_WIKITEXT\n")
     sys.exit(2)
 
 line = sys.argv[1]
 # A byte-string argument is decoded explicitly so that page titles with
 # non-ASCII characters do not rely on an implicit ASCII conversion when joined
 # to the page text.
 # NOTE(review): assumes the argument is UTF-8 and that page.get() returns
 # unicode text -- confirm against the pywikipediabot version in use.
 if not isinstance(line, type(u"")):
     line = line.decode("utf-8", "replace")
 
 site = wikipedia.getSite()
 page = wikipedia.Page(site, "Wikipedia:Suspected copyright violations") #CONFIG: change this for what page to commit to
 text = page.get()
 text = text + "\n" + line
 wikipedia.setAction("Adding a suspected copyright violation") #CONFIG: change edit summary
 page.put(text,minorEdit=False)

You need a user-config.py file in the pywikipediabot dir. Here's mine:

 # Settings for the pywikipediabot installation that append.py runs in.
 # "usernames" is not defined in this file; it is expected to exist already
 # when the settings are loaded.
 mylang = 'en'  #CONFIG: change for your wiki language
 usernames['wikipedia']['en'] = 'Wherebot'  #CONFIG: change for your wiki, wiki language and username
 
 # NOTE(review): presumably throttle limits for reading and saving pages --
 # confirm the meaning and units in pywikipediabot's configuration docs.
 maxthrottle = 2
 put_throttle = 3

Now run login.py in the pywikipediabot dir.

Finally, run persist.pl.

Static Wikipedia (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2006 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia February 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu