#!/usr/bin/perl

use strict;
use Graph::Easy;
use LWP;
use HTML::TokeParser;
use utf8;
use Getopt::Long;
use Encode;
use Data::Dumper;

my $VERSION = 0.03;

# things that shouldn't be looked at
my %bad = map { $_ => 1 } qw/
  Wikipedia Image Talk Help Template Portal Special User Category
  Wikipedia Bild Diskussion Hilfe Vorlage Portal Spezial Benutzer Kategorie
  Wikipédia Image Discuter Modèle Mod%C3%A9le Aide Utilisateur Catégorie Cat%C3%A9gorie
  /;
# do not crawl these:
my $skip = qr/\((disambiguation|Begriffsklärung|Homonymie)\)/i;
# to figure out redirections
my $redir = qr/(Weitergeleitet von|Redirected from|Redirig. depuis).*?title="(.*?)"/i;

# the default settings are defined in get_options()
# option handling
my $help_requested = 0; $help_requested = 1 if @ARGV == 0;

my $opt = get_options();

# error?
$help_requested = 1 if !ref($opt);

# no error and --help was specified
$help_requested = 2 if ref($opt) && $opt->{help} ne '';
my $copyright = "wikicrawl v$VERSION (c) by Tels 2008. "
  ."Released under the GPL 2.0 or later.\n\n"
  ."After a very cool idea by 'integral' on forum.xkcd.com. Thanx! :)\n\n";

if (ref($opt) && $opt->{version} != 0)
{
  print $copyright;
  print "Running under Perl v$].\n\n";
  exit 2;
}

if ($help_requested > 0)
{
  print STDERR $copyright;
  require Pod::Usage;
  if ($help_requested > 1 && $Pod::Usage::VERSION < 1.35)
  {
    # The way old Pod::Usage executes "perldoc" might fail:
    system('perldoc', $0);
    exit 2;
  }
  Pod::Usage::pod2usage( { -exitval => 2, -verbose => $help_requested } );
}
my $verbose = $opt->{verbose};

output ($copyright);

my $graph = Graph::Easy->new();
# set some default attributes on the graph
$graph->set_attribute('node','shape',$opt->{nodeshape});
$graph->set_attribute('node','font-size','80%');
$graph->set_attribute('edge','arrowstyle','filled');
$graph->set_attribute('graph','label',"Wikipedia map for $opt->{root}");
$graph->set_attribute('graph','font-size', '200%');
$graph->set_attribute('graph','comment', "Created with wikicrawl.pl v$VERSION");

output ("Using the following settings:\n");
print Data::Dumper->Dump([$opt], ['opt']);
# don't crawl stuff twice
my %visitedLinks;
# re-use the UserAgent object
my $ua = LWP::UserAgent->new();
#$ua->agent("WikiCrawl/$VERSION - " . $ua->_agent . " - vGraph::Easy $Graph::Easy::VERSION");

# count how many we have done
my $nodes = 0;

# enable UTF-8 output
binmode STDERR, ':utf8';
binmode STDOUT, ':utf8';

# push the first node on the stack
my @todo = [$opt->{root},0];
# and work on it (this will take one off and then push more nodes on it)
while (@todo && crawl()) { };

my $file = "wikicrawl-$opt->{lang}.txt";
output ("Generating $file:\n");
open(my $DATA, ">", "$file") or die("Could not write to '$file': $!");
binmode ($DATA,':utf8');
print $DATA $graph->as_txt();
close $DATA;
output ("All done.\n");

# derive the PNG name from the text file name (anchor the suffix match)
my $png = $file; $png =~ s/\.txt$/.png/;

output ("Generating $png:\n");
`perl -Ilib bin/graph-easy --png --renderer=$opt->{renderer} $file`;

output ("All done.\n");

########################################################################################
# main crawl routine
sub crawl {
  no warnings 'recursion';

  # all done?
  return if @todo == 0;
  my ($name,$depth) = ($todo[0]->[0],$todo[0]->[1]);
  shift @todo;

  my $page = "http://$opt->{lang}.wikipedia.org/wiki/$name";

  # limit depth
  return if $depth + 1 > $opt->{maxdepth};
  # already did as many nodes?
  return if $opt->{maxnodes} > 0 && $nodes > $opt->{maxnodes};
  # skip this page
  return 1 if exists $visitedLinks{$page};

  # crawl page
  my $res = $ua->request(HTTP::Request->new(GET => $page));
  return 1 unless $res->is_success();

  # remove the " - Wikipedia" (en) or " – Wikipedia" (de) from the title
  my $title = decode('utf8',$res->title);   # convert to UTF-8
  $title =~ s/ [–-] Wikip[ée]dia.*//;
  return 1 if $title =~ $skip;              # no disambiguation pages

  # tels: not sure when/why these happen:
  print STDERR "# $title ",$res->title()," $page\n" if $title eq '';
output ("Crawling node #$nodes '$title' at depth $depth\n"); $nodes++;
|
||
|
||
# set flag
|
||
$visitedLinks{$page} = undef;
|
||
my $content = $res->content;
|
||
|
||
# parse anchors
|
||
my $parser = HTML::TokeParser->new(\$content) or die("Could not parse page.");
|
||
|
||
# handle redirects:
|
||
$content = decode('utf-8', $content);
|
||
$content =~ $redir; my $old = $2;
|
||
|
||
if ($old)
|
||
{
|
||
output (" Redirected to '$title' from '$old'\n");
|
||
# find the node named "$old" (at the same time adding it if it didn't exist yet)
|
||
my $source = $graph->add_node($old);
|
||
# and mention the redirect in the label
|
||
$source->set_attribute('label', "$old\\n($title)");
|
||
# now force edges to come from that node
|
||
$title = $old;
|
||
}
|
||
|
||
  # iterate over all links
  for (my $i = 0; (my $token = $parser->get_tag("a")) && ($i < $opt->{maxspread} || $opt->{maxspread} == 0); )
  {
    my $url = $token->[1]{href};
    my $alt = $token->[1]{title};

    # skip anchors without a href or a title attribute
    next unless defined $url && defined $alt;
    # we do not crawl these:
    next if $url !~ m/^\/wiki\//;           # no pages outside of wikipedia
    next if $alt =~ $skip;                  # no disambiguation pages
    next if $alt =~ m/\[/;                  # no brackets

    my @chunks = split ":", substr(decode('utf-8',$url), 6);  # extract special pages, if any
    next if exists $bad{$chunks[0]};                          # no bad pages

    $i++;
    if ($title ne $alt)
    {
      output (" Adding link from '$title' to '$alt'\n", 1);
      my ($from,$to,$edge) = $graph->add_edge_once($title,$alt);
      if (defined $to)
      {
        my $old_depth = $to->raw_attribute('rank');
        if (!$old_depth)
        {
          # color the node by its depth, spreading the hue over the color wheel
          my $color = sprintf("%i", (360 / $opt->{maxdepth}) * ($depth));
          $to->set_attribute('fill', 'hsl(' .$color.',1,0.7)');
          # store rank
          $to->set_attribute('rank', $depth+1);
        }
      }
    }
    my $u = $url; $u =~ s/^\/wiki\///;
    push @todo, [$u,$depth+1];
  }

  # continue
  return 1;
}
# parse the command line options and fill in the defaults
sub get_options
{
  my $opt = {};
  $opt->{help} = '';
  $opt->{version} = 0;
  $opt->{verbose} = 0;
  # max depth to crawl
  $opt->{maxdepth} = 4;
  # max number of links per node
  $opt->{maxspread} = 5;
  # stop after so many nodes, -1 to disable
  $opt->{maxnodes} = -1;
  # language
  $opt->{lang} = 'en';
  # root node
  $opt->{root} = 'Xkcd';
  $opt->{renderer} = 'neato';
  $opt->{nodeshape} = 'rect';

  my @o = (
    "language=s"  => \$opt->{lang},
    "root=s"      => \$opt->{root},
    "maxdepth=i"  => \$opt->{maxdepth},
    "maxspread=i" => \$opt->{maxspread},
    "maxnodes=i"  => \$opt->{maxnodes},
    "version"     => \$opt->{version},
    "help|?"      => \$opt->{help},
    "verbose"     => \$opt->{verbose},
    "nodeshape=s" => \$opt->{nodeshape},
    );
  return unless Getopt::Long::GetOptions (@o);
  $opt;
}
# print a status message to STDERR; level 1 messages only appear with --verbose
sub output
{
  my ($txt, $level) = @_;

  $level ||= 0;

  print STDERR $txt if $opt->{verbose} || $level == 0;
}

=pod

=head1 NAME

wikicrawl - crawl Wikipedia to generate a graph from the found article links

=head1 SYNOPSIS

Crawl Wikipedia and create a L<Graph::Easy> text file describing the
inter-article links that were found during the crawl.

At least one argument must be given to start:

    perl examples/wikicrawl.pl --lang=fr
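
A longer run might look like this (the root article C<Physik> and the limits
are just illustrative values):

    perl examples/wikicrawl.pl --language=de --root=Physik --maxdepth=3 --maxspread=4 --verbose

Such a run writes a C<wikicrawl-de.txt> file in L<Graph::Easy> format and then
renders it to C<wikicrawl-de.png> by calling the C<graph-easy> command.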
=head1 ARGUMENTS

Here are the options:

=over 12

=item --help

Print the full documentation, not just this short overview.

=item --version

Write version info and exit.

=item --language

Select the language of Wikipedia that we should crawl. Currently supported
are 'de', 'en' and 'fr'. Default is 'en'.

=item --root

Set the root node where the crawl should start. Default is of course 'Xkcd'.

=item --maxdepth

The maximum depth the crawl should go to. Please select small values under 10. Default is 4.

=item --maxspread

The maximum number of links we follow per article. Please select small values under 10. Default is 5.

=item --maxnodes

The maximum number of nodes we crawl. Set to -1 (default) to disable.
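
=item --verbose

Print verbose progress messages to STDERR while crawling.

=item --nodeshape

Set the shape of the article nodes in the generated graph (passed to
L<Graph::Easy> as the node C<shape> attribute). Default is 'rect'.
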
=back

=head1 SEE ALSO

L<http://forums.xkcd.com/viewtopic.php?f=2&t=21300&p=672184> and
L<Graph::Easy>.
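
The text file written by a crawl can be read back and rendered with
L<Graph::Easy::Parser>; a minimal sketch, assuming a previous run with
C<--language=en> produced C<wikicrawl-en.txt>:

    use Graph::Easy::Parser;

    # re-load the crawl result and render it as ASCII art
    my $parser = Graph::Easy::Parser->new();
    my $graph  = $parser->from_file('wikicrawl-en.txt');
    print $graph->as_ascii();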
=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the terms of the GPL.

See the LICENSE file of Graph::Easy for a copy of the GPL.

X<license>

=head1 AUTHOR

Copyright (C) 2008 by integral L<forum.xkcd.com>
Copyright (C) 2008 by Tels L<http://bloodgate.com>

=cut