Static Wikipedia February 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Web Analytics
Cookie Policy Terms and Conditions Wikipedia:Duplicated sections/script - Wikipedia, the free encyclopedia

Wikipedia:Duplicated sections/script

From Wikipedia, the free encyclopedia

use strict;
use warnings;

# Hot pipes: a non-zero $| turns on autoflush for the currently
# selected output handle, so output is written as soon as it is
# printed.
$| = 1;

# This script expects data/entries.txt to be a database dump that has
# been pre-processed to put each page on a line by itself.

# On 31 July 2005, this script ran on a 1.2GHz i686 laptop with ~700MB
# RAM in about 20 minutes.  Not using the dupHeaders() filter will
# cause it to take probably about 5 hours or more.

# The author of this script is Christopher Beland, User:Beland on
# en.wikipedia.org.  It is hereby released into the Public Domain.
# Feel free to use it for any purpose whatsoever.

main();

# Read data/entries.txt (one page per line) and, for every page that
# dupHeaders() reports as having a repeated header, write one line to
# todo/duplicate-chunks.txt of the form
#
#   * [[Namespace:Title]] NN% repeated - R out of T triplets
#
# where T is the number of consecutive three-word sequences in the page
# text and R is the number of distinct sequences seen more than once.
# Pages with no repeated sequence are not listed.
#
# Takes no arguments and returns nothing useful.  Dies if the todo
# directory or either file cannot be opened, or if the report cannot be
# completely written.
sub main
{
    # Link prefix for each known namespace number.  Image and Category
    # carry a leading colon.
    my %namespace_name_for = (
        -2 => "Media:",
        -1 => "Special:",
         0 => "",
         1 => "Talk:",
         2 => "User:",
         3 => "User_talk:",
         4 => "Wikipedia:",
         5 => "Wikipedia_talk:",
         6 => ":Image:",
         7 => "Image_talk:",
         8 => "MediaWiki:",
         9 => "MediaWiki_talk:",
        10 => "Template:",
        11 => "Template_talk:",
        12 => "Help:",
        13 => "Help_talk:",
        14 => ":Category:",
        15 => "Category_talk:",
    );

    unless (-d "./todo")
    {
        mkdir "./todo"
            or die "Cannot create ./todo: $!";
    }

    open (my $entries, "<", "data/entries.txt")
        or die "Cannot read data/entries.txt: $!";
    open (my $report, ">", "todo/duplicate-chunks.txt")
        or die "Cannot write todo/duplicate-chunks.txt: $!";

    my $lines_read = 0;

    while (my $line = <$entries>)
    {
        # Progress indicator, rewritten in place every 100 lines.
        if (++$lines_read % 100 == 0)
        {
            print STDERR $lines_read."\r";
        }

        # SECURITY NOTE: every input line is executed as Perl source to
        # turn it into a list of fields.  Only run this on a dump you
        # trust; a hostile line can execute arbitrary code.
        my @tokens;
        eval("\@tokens = $line");
        if ($@)
        {
            # Without this check a malformed line would silently reuse
            # the fields of the previous page.
            warn "Skipping unparseable entry on line $lines_read: $@";
            next;
        }

        # Field order: id, namespace number, title, text, then fields
        # this script does not use.
        my (undef, $cur_namespace, $cur_title, $cur_text) = @tokens;

        next unless (defined $cur_text);

        # Cheap filter: only pages with a repeated header get the
        # expensive triplet analysis below.
        unless (dupHeaders($cur_text) == 1)
        {
            next;
        }

        # Look the prefix up afresh for every page so that a namespace
        # missing from the table falls back to no prefix instead of
        # inheriting the previous page's prefix.
        my $namespace_key = defined $cur_namespace ? $cur_namespace + 0 : 0;
        my $cur_namespace_name = exists $namespace_name_for{$namespace_key}
            ? $namespace_name_for{$namespace_key}
            : "";

        # Remove leading and trailing 's.
        $cur_title =~ s/^\'//;
        $cur_title =~ s/\'$//;
        # Remove leading and trailing whitespace
        $cur_title =~ s/^\s*//;
        $cur_title =~ s/\s*$//;

        # Turn literal backslash-n sequences into spaces and collapse
        # runs of whitespace, then split into words.
        $cur_text =~ s/\\n/ /g;
        $cur_text =~ s/\s+/ /g;

        my @chunks = split (" ", $cur_text);

        # Count every run of three consecutive words, working from the
        # end of the text.  ">= 3" makes sure the run formed by the
        # first three words is counted too.
        my %chains;
        my $triplet_count = 0;

        while (@chunks >= 3)
        {
            $chains{ join(" ", @chunks[-1, -2, -3]) }++;

            # Note: pop from the rear is a bjillion times more
            # efficient than unloading manually from the front.
            pop(@chunks);

            $triplet_count++;
        }

        # Number of distinct triplets that occur more than once.
        my $number_repeated = grep { $_ > 1 } values(%chains);

        # $triplet_count cannot be zero past this point: at least one
        # triplet was counted twice.
        next if ($number_repeated == 0);

        my $per = int(($number_repeated / $triplet_count) * 100);

        print {$report} "* [[".$cur_namespace_name.$cur_title."]]"
            ." ${per}% repeated - $number_repeated out of $triplet_count triplets\n";
    }

    close ($entries);

    # Buffered write errors only surface when the handle is closed.
    close ($report)
        or die "Cannot finish writing todo/duplicate-chunks.txt: $!";
}


# Report whether a page's text contains the same header line twice.
#
# Argument: the page text, in which line breaks may be written as a
#           literal backslash followed by "n".
# Returns:  1 if any line that starts (after optional whitespace) with
#           "=" appears more than once, 0 otherwise.  Lines are
#           compared exactly, including spacing.
sub dupHeaders
{
    my ($page_text) = @_;

    # Text without any "=" cannot contain a header, so there is
    # nothing to compare.
    return 0 if ($page_text !~ m/=/);

    # Work on a copy with the escaped line breaks turned into real ones.
    (my $expanded = $page_text) =~ s/\\n/\n/g;

    my %seen;

    for my $row (split ("\n", $expanded))
    {
        # Only header-looking lines are of interest.
        next unless ($row =~ m/^\s*\=/);

        # A second sighting of the same header settles the question.
        return 1 if ($seen{$row}++);
    }

    # Every header line was unique.
    return 0;
}


# Produce a copy of the report sorted numerically, highest first, on the
# text from the third whitespace-separated field onward.  That field is
# the percentage as long as the page title contains no spaces.
# The command's standard output is redirected to a file, so there is
# nothing to print here; the exit status is checked instead.
system("sort -nr -k3 todo/duplicate-chunks.txt > todo/duplicate-chunks-sorted.txt") == 0
    or die "Cannot sort todo/duplicate-chunks.txt (wait status $?)\n";
Static Wikipedia 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2006 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu