#!/usr/bin/perl

use strict;
use Graph::Easy;
use LWP;
use HTML::TokeParser;
use utf8;
use Getopt::Long;
use Encode;
use Data::Dumper;

my $VERSION = 0.03;

# things that shouldn't be looked at
my %bad = map { $_ => 1 } qw/
  Wikipedia Image Talk Help Template Portal Special User Category
  Wikipedia Bild Diskussion Hilfe Vorlage Portal Spezial Benutzer Kategorie
  Wikipédia Image Discuter Modèle Mod%C3%A9le Aide Utilisateur Catégorie Cat%C3%A9gorie
  /;
# do not crawl these:
my $skip = qr/\((disambiguation|Begriffsklärung|Homonymie)\)/i;
# to figure out redirections
my $redir = qr/(Weitergeleitet von|Redirected from|Redirig. depuis).*?title="(.*?)"/i;

# the default settings are defined in get_options()
# option handling
my $help_requested = 0; $help_requested = 1 if @ARGV == 0;

my $opt = get_options();

# error?
$help_requested = 1 if !ref($opt);

# no error and --help was specified
$help_requested = 2 if ref($opt) && $opt->{help} ne '';
my $copyright = "wikicrawl v$VERSION (c) by Tels 2008. "
  ."Released under the GPL 2.0 or later.\n\n"
  ."After a very cool idea by 'integral' on forum.xkcd.com. Thanx! :)\n\n";

if (ref($opt) && $opt->{version} != 0)
{
  print $copyright;
  print "Running under Perl v$].\n\n";
  exit 2;
}

if ($help_requested > 0)
{
  print STDERR $copyright;
  require Pod::Usage;
  if ($help_requested > 1 && $Pod::Usage::VERSION < 1.35)
  {
    # The way old Pod::Usage executes "perldoc" might fail:
    system('perldoc', $0);
    exit 2;
  }
  Pod::Usage::pod2usage( { -exitval => 2, -verbose => $help_requested } );
}
my $verbose = $opt->{verbose};

output ($copyright);

my $graph = Graph::Easy->new();
# set some default attributes on the graph
$graph->set_attribute('node','shape',$opt->{nodeshape});
$graph->set_attribute('node','font-size','80%');
$graph->set_attribute('edge','arrowstyle','filled');
$graph->set_attribute('graph','label',"Wikipedia map for $opt->{root}");
$graph->set_attribute('graph','font-size', '200%');
$graph->set_attribute('graph','comment', "Created with wikicrawl.pl v$VERSION");

output ("Using the following settings:\n");
print Data::Dumper->Dump([$opt], ['opt']);
# don't crawl stuff twice
my %visitedLinks;
# re-use the UserAgent object
my $ua = LWP::UserAgent->new();
#$ua->agent("WikiCrawl/$VERSION - " . $ua->_agent . " - vGraph::Easy $Graph::Easy::VERSION");

# count how many we have done
my $nodes = 0;

# enable UTF-8 output
binmode STDERR, ':utf8';
binmode STDOUT, ':utf8';

# push the first node on the stack
my @todo = [$opt->{root},0];
# and work on it (this will take one off and then push more nodes on it)
while (@todo && crawl()) { };

my $file = "wikicrawl-$opt->{lang}.txt";
output ("Generating $file:\n");
open(my $DATA, ">", "$file") or die("Could not write to '$file': $!");
binmode ($DATA,':utf8');
print $DATA $graph->as_txt();
close $DATA;
output ("All done.\n");

# derive the PNG name from the text file name (anchor the suffix match)
my $png = $file; $png =~ s/\.txt$/.png/;

output ("Generating $png:\n");
`perl -Ilib bin/graph-easy --png --renderer=$opt->{renderer} $file`;

output ("All done.\n");

########################################################################################
# main crawl routine
sub crawl {
  no warnings 'recursion';

  # all done?
  return if @todo == 0;
  my ($name,$depth) = ($todo[0]->[0],$todo[0]->[1]);
  shift @todo;

  my $page = "http://$opt->{lang}.wikipedia.org/wiki/$name";

  # limit depth
  return if $depth + 1 > $opt->{maxdepth};
  # already did as many nodes?
  return if $opt->{maxnodes} > 0 && $nodes > $opt->{maxnodes};
  # skip this page
  return 1 if exists $visitedLinks{$page};

  # crawl page
  my $res = $ua->request(HTTP::Request->new(GET => $page));
  return 1 unless $res->is_success();

  # remove the " - Wikipedia" (en) or " – Wikipedia" (de) from the title
  my $title = decode('utf8',$res->title);   # convert to UTF-8
  $title =~ s/ [–-] Wikip[ée]dia.*//;
  return 1 if $title =~ $skip;              # no disambiguation pages

  # tels: not sure when/why these happen:
  print STDERR "# $title ",$res->title()," $page\n" if $title eq '';
output ("Crawling node #$nodes '$title' at depth $depth\n"); $nodes++;
|
||
|
||
# set flag
|
||
$visitedLinks{$page} = undef;
|
||
my $content = $res->content;
|
||
|
||
# parse anchors
|
||
my $parser = HTML::TokeParser->new(\$content) or die("Could not parse page.");
|
||
|
||
# handle redirects:
|
||
$content = decode('utf-8', $content);
|
||
$content =~ $redir; my $old = $2;
|
||
|
||
if ($old)
|
||
{
|
||
output (" Redirected to '$title' from '$old'\n");
|
||
# find the node named "$old" (at the same time adding it if it didn't exist yet)
|
||
my $source = $graph->add_node($old);
|
||
# and mention the redirect in the label
|
||
$source->set_attribute('label', "$old\\n($title)");
|
||
# now force edges to come from that node
|
||
$title = $old;
|
||
}
|
||
|
||
  # iterate over all links
  for (my $i = 0; (my $token = $parser->get_tag("a")) && ($i < $opt->{maxspread} || $opt->{maxspread} == 0); )
  {
    my $url = $token->[1]{href};
    my $alt = $token->[1]{title};

    # skip anchors without a href or a title attribute
    next unless defined $url && defined $alt;
    # we do not crawl these:
    next if $url !~ m/^\/wiki\//;           # no pages outside of wikipedia
    next if $alt =~ $skip;                  # no disambiguation pages
    next if $alt =~ m/\[/;                  # no brackets

    my @chunks = split ":", substr(decode('utf-8',$url), 6);  # extract special pages, if any
    next if exists $bad{$chunks[0]};                          # no bad pages

    $i++;
    if ($title ne $alt)
    {
      output (" Adding link from '$title' to '$alt'\n", 1);
      my ($from,$to,$edge) = $graph->add_edge_once($title,$alt);
      if (defined $to)
      {
        my $old_depth = $to->raw_attribute('rank');
        if (!$old_depth)
        {
          # color the node by its depth, spreading the hue over the color wheel
          my $color = sprintf("%i", (360 / $opt->{maxdepth}) * ($depth));
          $to->set_attribute('fill', 'hsl(' .$color.',1,0.7)');
          # store rank
          $to->set_attribute('rank', $depth+1);
        }
      }
    }
    my $u = $url; $u =~ s/^\/wiki\///;
    push @todo, [$u,$depth+1];
  }

  # continue
  return 1;
}
# parse the command line options and fill in the defaults
sub get_options
{
  my $opt = {};
  $opt->{help} = '';
  $opt->{version} = 0;
  $opt->{verbose} = 0;
  # max depth to crawl
  $opt->{maxdepth} = 4;
  # max number of links per node
  $opt->{maxspread} = 5;
  # stop after so many nodes, -1 to disable
  $opt->{maxnodes} = -1;
  # language
  $opt->{lang} = 'en';
  # root node
  $opt->{root} = 'Xkcd';
  $opt->{renderer} = 'neato';
  $opt->{nodeshape} = 'rect';

  my @o = (
    "language=s"  => \$opt->{lang},
    "root=s"      => \$opt->{root},
    "maxdepth=i"  => \$opt->{maxdepth},
    "maxspread=i" => \$opt->{maxspread},
    "maxnodes=i"  => \$opt->{maxnodes},
    "version"     => \$opt->{version},
    "help|?"      => \$opt->{help},
    "verbose"     => \$opt->{verbose},
    "nodeshape=s" => \$opt->{nodeshape},
    );
  return unless Getopt::Long::GetOptions (@o);
  $opt;
}
# print a status message to STDERR; level 1 messages only appear with --verbose
sub output
{
  my ($txt, $level) = @_;

  $level ||= 0;

  print STDERR $txt if $opt->{verbose} || $level == 0;
}

=pod

=head1 NAME

wikicrawl - crawl Wikipedia to generate a graph from the found article links

=head1 SYNOPSIS

Crawl Wikipedia and create a L<Graph::Easy> text file describing the
inter-article links that were found during the crawl.

At least one argument must be given to start:

    perl examples/wikicrawl.pl --lang=fr
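
A longer run might look like this (the root article C<Physik> and the limits
are just illustrative values):

    perl examples/wikicrawl.pl --language=de --root=Physik --maxdepth=3 --maxspread=4 --verbose

Such a run writes a C<wikicrawl-de.txt> file in L<Graph::Easy> format and then
renders it to C<wikicrawl-de.png> by calling the C<graph-easy> command.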
=head1 ARGUMENTS

Here are the options:

=over 12

=item --help

Print the full documentation, not just this short overview.

=item --version

Write version info and exit.

=item --language

Select the language of Wikipedia that we should crawl. Currently supported
are 'de', 'en' and 'fr'. Default is 'en'.

=item --root

Set the root node where the crawl should start. Default is of course 'Xkcd'.

=item --maxdepth

The maximum depth the crawl should go to. Please select small values under 10. Default is 4.

=item --maxspread

The maximum number of links we follow per article. Please select small values under 10. Default is 5.

=item --maxnodes

The maximum number of nodes we crawl. Set to -1 (default) to disable.
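
=item --verbose

Print verbose progress messages to STDERR while crawling.

=item --nodeshape

Set the shape of the article nodes in the generated graph (passed to
L<Graph::Easy> as the node C<shape> attribute). Default is 'rect'.
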
=back

=head1 SEE ALSO

L<http://forums.xkcd.com/viewtopic.php?f=2&t=21300&p=672184> and
L<Graph::Easy>.
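
The text file written by a crawl can be read back and rendered with
L<Graph::Easy::Parser>; a minimal sketch, assuming a previous run with
C<--language=en> produced C<wikicrawl-en.txt>:

    use Graph::Easy::Parser;

    # re-load the crawl result and render it as ASCII art
    my $parser = Graph::Easy::Parser->new();
    my $graph  = $parser->from_file('wikicrawl-en.txt');
    print $graph->as_ascii();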
=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the terms of the GPL.

See the LICENSE file of Graph::Easy for a copy of the GPL.

X<license>

=head1 AUTHOR

Copyright (C) 2008 by integral L<forum.xkcd.com>
Copyright (C) 2008 by Tels L<http://bloodgate.com>

=cut