Viquipèdia:Enllaços incorrectes a pàgines de desambiguació/Scripts

#! /usr/bin/perl

use strict;
 
my %interesting=
    ('' => {
        name            => 'article',
        filename        => 'articles.txt',
        cutoff          => 1},
     'Plantilla' => {
         name           => 'plantilla',
         filename       => 'templates.txt',
         cutoff         => 0,
         list           => 1});
 
my $exp_re=qr/\(desambiguació\)$/;
 
my @templates=split(/\n/,<<__EOT__);
Acrònim
Biografies
Desambiguació
Desambiguació 2
DesambigCurta
__EOT__
 
foreach my $template (@templates) {
    $template =~ s/^([[:alpha:]])/[$1\L$1]/;
}
 
my $tmpl_re=join('|',sort({$b cmp $a} @templates));
 
my $dab_re=qr/{{(?i:msg:)?\s*(?i:plantilla\s*:\s*)?($tmpl_re)\s*(?i:\||}})/;
 
my($ns_re,%ns_canon);
 
my $want_progress=@ARGV>0 && $ARGV[0] eq '-p';
my $last_progress=-1;
 
sub pageloop (&)
{
    my($handler)=@_;
    my($size);
    local $/="</page>\x0A";
 
    $size=-s PAGES;
    while (defined(my $page=<PAGES>)) {
        my($nstitle,$ns,$title);
 
        $page =~ /^\s*<page>/ or last;
        ($nstitle)=($page =~ m{<title>([^<]+)</title>})
            or die "Impossible de trouver le titre de la page";
        if ($nstitle =~ /^($ns_re):(.+)$/) {
            $ns=$1;
            $title=$2;
        } else {
            $ns='';
            $title=$nstitle;
        }
        $page =~ m{</text>} or next;
        substr($page,$-[0])='';
        $page =~ /<text xml:space="preserve">/
            or die "Impossible de trouver le début du texte pour la page $nstitle";
        substr($page,0,$+[0])='';
        $handler->($nstitle,$ns,$title,$page);
        if ($want_progress) {
            my $progress=int(tell(PAGES)/$size*1000);
            if ($progress!=$last_progress) {
                $last_progress=$progress;
                printf STDERR "\r0.%.3u",$progress;
            }
        }
    }
    if ($want_progress) {
        print STDERR "\r";
    }
}
 
sub mungtarget ($$$ )
{
    my(undef,$source,$sub)=@_;
 
    for my $target ($_[0]) {
        $target =~ tr/\t\n\r/   /;
        $target =~ s/^ +//;
        $target =~ s/ +$//;
        $target =~ s/ {2,}/ /g;
        if ($sub && $target =~ m{^/}) {
            $target=$source.$target;
        } elsif ($target =~ /^:*($ns_re) *: *(.+)$/i) {
            $target=$2;
            utf8::decode($target);
            $target=ucfirst($target);
            utf8::encode($target);
            $target=$ns_canon{lc($1)}.":".$target;
        } elsif ($target =~ /^:*(.+)$/i) {
            $target=$1;
            utf8::decode($target);
            $target=ucfirst($target);
            utf8::encode($target);
        } else {
            # a malformed link, usually empty brackets
        }
    }
}
 
my(%dab,%redir,@circular);
 
sub pass1 ()
{
    print STDERR "Analyse : 1er passage\n";
    {
        my($siteinfo,@namespaces);
        local $/="</siteinfo>\x0A";
 
        $siteinfo=<PAGES>;
        @namespaces=
            $siteinfo =~ m{<namespace key="-?\d+">([^<]+)</namespace>}g;
        $ns_re=join('|',map(quotemeta($_),sort({$b cmp $a} @namespaces)));
        foreach my $ns (@namespaces) {
            $ns_canon{lc($ns)}=$ns;
        }
    }
    pageloop {
        my($nstitle,$ns,$title)=splice(@_,0,3);
 
        for my $text ($_[0]) {
            my $sub=$interesting{$ns}->{subpages};
 
            if ($ns eq '' && $text =~ $dab_re) {
                $dab{$nstitle}=1;
            }
            if ($text =~ /^#redirect.*\[\[([^\]\|]+)/i) {
                my($target,$back);
 
                $target=$1;
                mungtarget($target,$nstitle,$sub);
                while ($target ne $nstitle) {
                    my($newtarget);
 
                    $newtarget=$redir{$target};
                    last unless defined($newtarget);
                    $target=$newtarget;
                }
                if ($target eq $nstitle) {
                    push(@circular,$nstitle);
                } else {
                    $redir{$nstitle}=$target;
                }
            }
        }
    };
    foreach my $target (keys(%redir)) {
        my(@chain);
 
        for (;;) {
            my $newtarget=$redir{$target};
            last unless defined($newtarget);
            push(@chain,$target);
            $target=$newtarget;
        }
        pop(@chain);
        foreach my $source (@chain) {
            $redir{$source}=$target;
        }
    }
 
    print STDERR "    ".keys(%dab)." pages d'homonymie\n";
    print STDERR "\n";
}
 
my %stats=map {
    ($_,{});
} keys(%interesting);
 
my %lists=map {
    ($_,{});
} grep {
    $interesting{$_}->{list};
} keys(%interesting);
 
sub pass2 ()
{
    my(%linked);
 
    print STDERR "Analyse : 2me passage\n";
    {
        local $/="</siteinfo>\x0A";
 
        <PAGES>;
    }
    pageloop {
        my($nstitle,$ns,$title)=splice(@_,0,3);
 
        for my $text ($_[0]) {
            my($stats,$lists,$sub);
 
            $stats=$stats{$ns};
            $lists=$lists{$ns};
            $sub=$interesting{$ns}->{subpages};
            if ($stats) {
                my(%seen);
 
                while ($text =~ /\[\[([^\]\|]+)/g) {
                    my($target,$final);
 
                    $target=$1;
                    mungtarget($target,$nstitle,$sub);
                    next if $target =~ $exp_re;
                    $final=$redir{$target};
                    $final=$target unless defined($final);
                    if ($dab{$final} && !$seen{$final}++) {
                        $linked{$final}=1;
                        $stats->{$final}++;
                        if ($lists) {
                            push(@{$lists->{$final}},$nstitle);
                        }
                    }
                }
            }
        }
    };
    print STDERR "    ".keys(%linked)." liens vers les pages d'homonymie\n";
    foreach my $ns (sort(keys(%stats))) {
        print STDERR ("    ".keys(%{$stats{$ns}})." dans l'espace de nom ".
                      $interesting{$ns}->{name}."\n");
    }
    print STDERR "\n";
}
 
sub wikilink ($ )
{
    my($target)=@_;
 
    if (exists($redir{$target})) {
        "[{{SERVER}}{{localurl:$target|redirect=no}} $target]";
    } elsif ($target =~ m{/\.{1,2}(?:$|/)}) {
        "[{{SERVER}}{{localurl:$target}} $target]";
    } elsif ($target =~ m{^/}) {
        "[[:$target]]";
    } else {
        "[[$target]]";
    }
}
 
sub report ()
{
    print STDERR "Génération du rapport\n";
 
    foreach my $target (@circular) {
        $redir{$target}=$target;
    }
 
    while (my($ns,$stats)=each(%stats)) {
        my($filename,$cutoff)=@{$interesting{$ns}}{qw(filename cutoff)};
        my $lists=$lists{$ns};
        my @nstitles=sort {
            $stats->{$b}<=>$stats->{$a} || $a cmp $b;
        } grep {
            $stats->{$_}>=$cutoff;
        } keys(%{$stats});
        my $total=0;
 
        open(REPORT,'>',$filename)
            or die "Impossible de créer $filename: $!";
        binmode(REPORT);
        print REPORT "\xEF\xBB\xBF";
        foreach my $nstitle (@nstitles) {
            $total+=$stats->{$nstitle};
        }
        print REPORT "Nombre total de liens : $total\n";
        foreach my $nstitle (@nstitles) {
            print REPORT ("# ",wikilink($nstitle),": ",$stats->{$nstitle},
                          " [[Especial:Whatlinkshere/",$nstitle,"|liens]]\n");
            if ($lists) {
                foreach my $source (sort(@{$lists->{$nstitle}})) {
                    print REPORT "#* ",wikilink($source),"\n";
                }
            }
        }
        close(REPORT);
        print STDERR "    ".@nstitles." entrées ajoutées à $filename\n";
    }
 
    if (@circular) {
        @circular=sort(@circular);
        open(REPORT,'>','circular.txt')
            or die "Impossible de créer circular.txt: $!";
        binmode(REPORT);
        print REPORT "\xEF\xBB\xBF";
        foreach my $target (@circular) {
            print REPORT "* ",wikilink($target),"\n";
        }
        close(REPORT);
        print STDERR "    ".@circular." entrées ajoutées à circular.txt\n";
    } else {
        unlink('circular.txt');
    }
}
 
open(PAGES,'<','cawiki-latest-pages-articles.xml')
    or die "Impossible d'ouvrir cawiki-latest-pages-articles.xml: $!";
binmode(PAGES);
pass1();
seek(PAGES,0,0);
pass2();
close(PAGES);
report();