#!/usr/bin/perl

use strict;
use Graph::Easy;
use LWP;
use HTML::TokeParser;
use utf8;
use Getopt::Long;
use Encode;
use Data::Dumper;

my $VERSION = 0.03;

# things that shouldn't be looked at
my %bad = map { $_ => 1 } qw/
    Wikipedia Image Talk Help Template Portal Special User Category
    Wikipedia Bild Diskussion Hilfe Vorlage Portal Spezial Benutzer Kategorie
    Wikipédia Image Discuter Modèle Mod%C3%A9le Aide Utilisateur Catégorie Cat%C3%A9gorie
    /;

# do not crawl these:
my $skip = qr/\((disambiguation|Begriffsklärung|Homonymie)\)/i;

# to figure out redirections
my $redir = qr/(Weitergeleitet von|Redirected from|Redirig. depuis).*?title="(.*?)"/i;

# the default settings are defined in get_options()

# option handling
my $help_requested = 0;
$help_requested = 1 if @ARGV == 0;

my $opt = get_options();

# error?
$help_requested = 1 if !ref($opt);
# no error and --help was specified
$help_requested = 2 if ref($opt) && $opt->{help} ne '';

my $copyright = "wikicrawl v$VERSION (c) by Tels 2008. "
  . "Released under the GPL 2.0 or later.\n\n"
  . "After a very cool idea by 'integral' on forum.xkcd.com. Thanx! :)\n\n";

if (ref($opt) && $opt->{version} != 0) {
    print $copyright;
    print "Running under Perl v$].\n\n";
    exit 2;
}

if ($help_requested > 0) {
    print STDERR $copyright;
    require Pod::Usage;
    if ($help_requested > 1 && $Pod::Usage::VERSION < 1.35) {
        # The way old Pod::Usage executes "perldoc" might fail:
        system('perldoc', $0);
        exit 2;
    }
    Pod::Usage::pod2usage( { -exitval => 2, -verbose => $help_requested } );
}

my $verbose = $opt->{verbose};

output($copyright);

my $graph = Graph::Easy->new();

# set some default attributes on the graph
$graph->set_attribute('node', 'shape', $opt->{nodeshape});
$graph->set_attribute('node', 'font-size', '80%');
$graph->set_attribute('edge', 'arrowstyle', 'filled');
$graph->set_attribute('graph', 'label', "Wikipedia map for $opt->{root}");
$graph->set_attribute('graph', 'font-size', '200%');
$graph->set_attribute('graph', 'comment', "Created with wikicrawl.pl v$VERSION");

output("Using the following settings:\n");
print Data::Dumper->Dump([$opt], ['opt']);

# don't crawl stuff twice
my %visitedLinks;

# re-use the UserAgent object
my $ua = LWP::UserAgent->new();
#$ua->agent("WikiCrawl/$VERSION - " . $ua->_agent . " - vGraph::Easy $Graph::Easy::VERSION");

# count how many we have done
my $nodes = 0;

# enable UTF-8 output
binmode STDERR, ':utf8';
binmode STDOUT, ':utf8';

# push the first node on the stack
my @todo = ([$opt->{root}, 0]);

# and work on it (this will take one off and then push more nodes on it)
while (@todo && crawl()) { }

my $file = "wikicrawl-$opt->{lang}.txt";
output("Generating $file:\n");

open(my $DATA, ">", $file) or die("Could not write to '$file': $!");
binmode($DATA, ':utf8');
print $DATA $graph->as_txt();
close $DATA;

output("All done.\n");

my $png = $file;
$png =~ s/\.txt\z/.png/;

output("Generating $png:\n");
`perl -Ilib bin/graph-easy --png --renderer=$opt->{renderer} $file`;

output("All done.\n");

########################################################################################
# main crawl routine

sub crawl {
    no warnings 'recursion';

    # all done?
    return if @todo == 0;

    my ($name, $depth) = ($todo[0]->[0], $todo[0]->[1]);
    shift @todo;

    my $page = "http://$opt->{lang}.wikipedia.org/wiki/$name";

    # limit depth
    return if $depth + 1 > $opt->{maxdepth};

    # already did as many nodes?
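    # (a maxnodes value of 0 or below, e.g. the default of -1, disables this limit)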
    return if $opt->{maxnodes} > 0 && $nodes > $opt->{maxnodes};

    # skip this page
    return 1 if exists $visitedLinks{$page};

    # crawl page
    my $res = $ua->request(HTTP::Request->new(GET => $page));
    return 1 unless $res->is_success();

    # remove the " - Wikipedia" (en) or " – Wikipedia" (de) from the title
    my $title = decode('utf8', $res->title);    # convert to UTF-8
    $title =~ s/ [–-] Wikip[ée]dia.*//;

    return 1 if $title =~ $skip;    # no disambiguation pages

    # tels: not sure when/why these happen:
    print STDERR "# $title ", $res->title(), " $page\n" if $title eq '';

    output("Crawling node #$nodes '$title' at depth $depth\n");
    $nodes++;

    # set flag
    $visitedLinks{$page} = undef;

    my $content = $res->content;

    # parse anchors
    my $parser = HTML::TokeParser->new(\$content) or die("Could not parse page.");

    # handle redirects:
    $content = decode('utf-8', $content);
    my $old;
    $old = $2 if $content =~ $redir;
    if ($old) {
        output(" Redirected to '$title' from '$old'\n");
        # find the node named "$old" (at the same time adding it if it didn't exist yet)
        my $source = $graph->add_node($old);
        # and mention the redirect in the label
        $source->set_attribute('label', "$old\\n($title)");
        # now force edges to come from that node
        $title = $old;
    }

    # iterate over all links
    for (my $i = 0;
         (my $token = $parser->get_tag("a")) &&
         ($i < $opt->{maxspread} || $opt->{maxspread} == 0); ) {
        my $url = $token->[1]{href};
        my $alt = $token->[1]{title};

        next unless defined $url;
        next unless defined $alt;    # links without a title attribute have no article name

        # we do not crawl these:
        next if $url !~ m/^\/wiki\//;    # no pages outside of wikipedia
        next if $alt =~ $skip;           # no disambiguation pages
        next if $alt =~ m/\[/;           # no brackets

        # extract special pages, if any
        my @chunks = split ":", substr(decode('utf-8', $url), 6);
        next if exists $bad{$chunks[0]};    # no bad pages

        $i++;

        if ($title ne $alt) {
            output(" Adding link from '$title' to '$alt'\n", 1);
            my ($from, $to, $edge) = $graph->add_edge_once($title, $alt);
            if (defined $to) {
                my $old_depth = $to->raw_attribute('rank');
                if (!$old_depth) {
                    my $color = sprintf("%i", (360 / $opt->{maxdepth}) * $depth);
                    $to->set_attribute('fill', 'hsl(' . $color . ',1,0.7)');
                    # store rank
                    $to->set_attribute('rank', $depth + 1);
                }
            }
        }

        my $u = $url;
        $u =~ s/^\/wiki\///;
        push @todo, [$u, $depth + 1];
    }

    # continue
    return 1;
}

sub get_options {
    my $opt = {};

    $opt->{help}    = '';
    $opt->{version} = 0;
    $opt->{verbose} = 0;

    # max depth to crawl
    $opt->{maxdepth} = 4;
    # max number of links per node
    $opt->{maxspread} = 5;
    # stop after so many nodes, -1 to disable
    $opt->{maxnodes} = -1;
    # language
    $opt->{lang} = 'en';
    # root node
    $opt->{root} = 'Xkcd';

    $opt->{renderer}  = 'neato';
    $opt->{nodeshape} = 'rect';

    my @o = (
        "language=s"  => \$opt->{lang},
        "root=s"      => \$opt->{root},
        "maxdepth=i"  => \$opt->{maxdepth},
        "maxspread=i" => \$opt->{maxspread},
        "maxnodes=i"  => \$opt->{maxnodes},
        "version"     => \$opt->{version},
        "help|?"      => \$opt->{help},
        "verbose"     => \$opt->{verbose},
        "nodeshape=s" => \$opt->{nodeshape},
    );

    return unless Getopt::Long::GetOptions(@o);

    $opt;
}

sub output {
    my ($txt, $level) = @_;
    $level ||= 0;

    print STDERR $txt if $opt->{verbose} || $level == 0;
}

=pod

=head1 NAME

wikicrawl - crawl Wikipedia and generate a graph from the article links found

=head1 SYNOPSIS

Crawl Wikipedia and create a L<Graph::Easy> text file describing the
inter-article links that were found during the crawl.

At least one argument must be given to start:

    perl examples/wikicrawl.pl --lang=fr

=head1 ARGUMENTS

Here are the options:

=over 12

=item --help

Print the full documentation, not just this short overview.

=item --version

Write version info and exit.

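=item --verbose

Also print each link that is added to the graph to STDERR, in addition to the
normal progress output.

=item --nodeshape

Set the shape used for the article nodes in the generated graph. Default is
'rect'.
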
=item --language

Select the language edition of Wikipedia that we should crawl. Currently
supported are 'de', 'en' and 'fr'. Default is 'en'.

=item --root

Set the root node where the crawl should start. Default is of course 'Xkcd'.

=item --maxdepth

The maximum depth to which the crawl should go. Please select small values
under 10. Default is 4.

=item --maxspread

The maximum number of links we follow per article. Please select small values
under 10. Default is 5.

=item --maxnodes

The maximum number of nodes we crawl. Set to -1 (default) to disable.

=back

See the EXAMPLES section below for invocations that combine several of these
options.

=head1 SEE ALSO

L<Graph::Easy> and L<graph-easy>.

=head1 LICENSE

This library is free software; you can redistribute it and/or modify it under
the terms of the GPL. See the LICENSE file of Graph::Easy for a copy of the GPL.

=head1 AUTHOR

Copyright (C) 2008 by integral

Copyright (C) 2008 by Tels

=cut

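=pod

=head1 EXAMPLES

A few illustrative invocations; the root article and option values below are
arbitrary examples, not defaults:

    perl examples/wikicrawl.pl --root=Perl --maxdepth=3 --maxspread=4 --verbose

    perl examples/wikicrawl.pl --language=de --root=Xkcd --maxnodes=50

The script writes its result to wikicrawl-<lang>.txt (for instance
wikicrawl-de.txt for --language=de) and then calls graph-easy to render a PNG
from that file. Since it invokes "perl -Ilib bin/graph-easy", it expects to be
run from the top directory of the Graph::Easy distribution.

=cut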