#!/usr/bin/perl
#
# Small web crawler: starts from every URL listed in the whitelist file,
# follows links up to a configured depth, and records each newly discovered
# URL (with page title, remaining depth and a timestamp) in the "mose" table.
# URLs matching an entry of the blacklist file are neither stored nor followed.

use strict;
use warnings;

use WWW::Mechanize;
use DBI;

# Settings expected to be provided by the two config files required below.
our ($M_C_DEEP, $M_C_WHITELIST, $M_C_BLACKLIST, $M_C_LINKMAXAGE);
our ($M_DB_HOST, $M_DB_USER, $M_DB_PASS);

require "config.crawl.pl";
require "config.global.pl";

# NOTE(review): this turns off TLS hostname verification for every LWP
# request made by this process. Kept as in the original; confirm it is
# really wanted.
$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;

my @A_WHITELIST = read_list_file($M_C_WHITELIST);
my @A_BLACKLIST = read_list_file($M_C_BLACKLIST);

# url => largest remaining depth the url has already been expanded with.
# Prevents expanding the same page over and over when pages link to each
# other.
my %CRAWLED_DEPTH;

foreach my $link (@A_WHITELIST) {
    read_link($link, $M_C_DEEP) unless in_blacklist($link);
}

# read_list_file($path)
#
# Reads a list file with one entry per line and returns the entries as a
# list. Line endings are removed and blank lines are skipped. Dies when the
# file cannot be opened.
sub read_list_file {
    my $path = shift;

    open(my $fh, '<', $path) or die "Could not open '$path': $!";
    my @entries;
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;
        # A blank entry must never reach in_blacklist(): an empty regex
        # makes Perl reuse the last successful pattern.
        next if $line =~ /\A\s*\z/;
        push @entries, $line;
    }
    close($fh);

    return @entries;
}

# read_link($link, $deep)
#
# Recursively walks $link: every link found on the page that is not
# blacklisted is stored via read_url_to_db() and, while $deep is greater
# than zero, crawled itself with a depth of $deep - 1.
sub read_link {
    my $link = shift;
    my $deep = shift;

    # Skip pages that were already expanded with at least this much depth.
    return if exists $CRAWLED_DEPTH{$link} && $CRAWLED_DEPTH{$link} >= $deep;
    $CRAWLED_DEPTH{$link} = $deep;

    foreach my $url (fetch_links($link)) {
        next if in_blacklist($url);

        read_url_to_db($url, $deep);

        # Pass the reduced depth down instead of decrementing $deep, so
        # that all sibling links are handled with the same depth.
        read_link($url, $deep - 1) if $deep > 0;
    }

    return;
}

# fetch_links($link)
#
# Fetches $link and returns the absolute http/https URLs of all links found
# on that page. Returns an empty list when the page cannot be fetched.
sub fetch_links {
    my $link = shift;

    my $mech = WWW::Mechanize->new(onerror => undef);
    my $fetched = eval { $mech->get($link); 1 };
    return unless $fetched && $mech->success;

    my @links;
    foreach my $found ($mech->links()) {
        # Resolve relative links against the page's base URL instead of
        # appending them to the page URL.
        my $abs = eval { $found->url_abs };
        next unless defined $abs;

        my $url = "$abs";
        # Drop mailto:, javascript:, ftp: and similar targets.
        next unless $url =~ m{\Ahttps?://}i;

        push @links, $url;
    }

    return @links;
}

# in_blacklist($link)
#
# Returns 1 when $link matches at least one blacklist entry, 0 otherwise.
# Each entry is applied to the link as a regular expression.
sub in_blacklist {
    my $link = shift;

    foreach my $black (@A_BLACKLIST) {
        # link is blacklisted
        return 1 if $link =~ /$black/;
    }

    # link is not blacklisted
    return 0;
}

# read_url_to_db($link, $deep)
#
# Stores $link in the "mose" table together with the page title, $deep and
# the current time, unless a row for that link already exists. Dies when
# the database connection cannot be established.
sub read_url_to_db {
    my $link = shift;
    my $deep = shift;
    my $timestamp = time();

    # NOTE(review): the connection data is hard-coded although $M_DB_HOST,
    # $M_DB_USER and $M_DB_PASS are declared above; confirm which values
    # are meant to be used.
    my $dbh = DBI->connect('DBI:mysql:1_mose', '1_mose', '1_mose')
        || die "Could not connect to database: $DBI::errstr";

    # Is the link already stored? Values are bound through placeholders,
    # never interpolated: both url and title come from crawled pages.
    my $sth = $dbh->prepare("SELECT count(*) FROM mose WHERE link = ?");
    $sth->execute($link);
    my @result = $sth->fetchrow_array();
    $sth->finish();

    if (!$result[0]) {
        # Not stored yet: only now fetch the page to read its title.
        my $mech = WWW::Mechanize->new(onerror => undef);
        my $fetched = eval { $mech->get($link); 1 };
        my $title = $fetched ? $mech->title() : undef;
        $title = '' unless defined $title;

        print "'$title', '$link', '$deep', '$timestamp'\n";

        $sth = $dbh->prepare(
            "insert into mose (title, link, deep, timestamp) values (?, ?, ?, ?)"
        );
        $sth->execute($title, $link, $deep, $timestamp);
        $sth->finish();
    }

    $dbh->disconnect();

    return;
}