Files
2024-10-14 00:08:40 +02:00

106 lines
2.3 KiB
Perl

#!/usr/bin/perl
use strict;
use WWW::Mechanize;
use DBI;
our ($M_C_DEEP, $M_C_WHITELIST, $M_C_BLACKLIST, $M_C_LINKMAXAGE);
our ($M_DB_HOST, $M_DB_USER, $M_DB_PASS);
require "config.crawl.pl";
require "config.global.pl";
# NOTE(review): this switches off TLS hostname verification for every LWP
# request the crawler makes; confirm that this is really intended.
$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;

# Start URLs, one per line. Without them there is nothing to crawl, so a
# missing or unreadable whitelist file is fatal.
open(my $fh_whitelist, '<', $M_C_WHITELIST)
    or die "Cannot open whitelist '$M_C_WHITELIST': $!";
my @A_WHITELIST = <$fh_whitelist>;
close $fh_whitelist;
chomp @A_WHITELIST;

# Blacklist entries, one regular expression per line (they are matched by
# in_blacklist()). The crawl can still run without a blacklist, so an
# unreadable file only produces a warning and leaves the list empty.
my @A_BLACKLIST;
if (open(my $fh_blacklist, '<', $M_C_BLACKLIST)) {
    @A_BLACKLIST = <$fh_blacklist>;
    close $fh_blacklist;
    chomp @A_BLACKLIST;
}
else {
    warn "Cannot open blacklist '$M_C_BLACKLIST': $!; continuing without one\n";
}

# Crawl every whitelisted start URL down to the configured depth.
foreach my $link (@A_WHITELIST) {
    next unless length $link;    # ignore blank lines in the whitelist
    read_link($link, $M_C_DEEP) unless in_blacklist($link);
}
# Crawl $link recursively.
#
# Every link found on the page at $link that is not blacklisted is stored
# via read_url_to_db() and, while $deep is greater than zero, followed with
# a depth budget reduced by one.
#
# Parameters:
#   $link - URL of the page whose outgoing links are processed
#   $deep - remaining recursion depth; 0 means links are stored but not
#           followed any further
sub read_link {
    my ($link, $deep) = @_;

    foreach my $url (fetch_links($link)) {
        # A single blacklist lookup covers both storing and following.
        next if in_blacklist($url);

        read_url_to_db($url, $deep);

        # Hand the reduced depth to the recursive call instead of
        # decrementing $deep in place: every link on this page must get the
        # same budget and be stored with the same depth value, no matter
        # how many siblings were processed before it.
        read_link($url, $deep - 1) if $deep > 0;
    }
    return;
}
# Fetch the page at $link and return the links found on it.
#
# Parameters:
#   $link - URL of the page to fetch
#
# Returns a list of absolute http/https URL strings. Relative references
# are resolved against the base of the fetched page; links with any other
# scheme (mailto:, javascript:, ...) are dropped because the crawler cannot
# fetch them.
sub fetch_links {
    my $link = shift;

    # onerror => undef keeps WWW::Mechanize from dying on a failed request.
    my $mech = WWW::Mechanize->new(onerror => undef);
    $mech->get($link);

    my @links;
    foreach my $found ($mech->links()) {
        # url_abs() resolves the href against the page base. Plain string
        # concatenation of page URL and href yields wrong URLs for anything
        # but a bare relative file name. The eval keeps a link that cannot
        # be resolved from aborting the whole crawl.
        my $abs = eval { $found->url_abs() };
        next unless defined $abs;

        my $scheme = $abs->scheme;
        next unless defined $scheme && $scheme =~ /\Ahttps?\z/i;

        push @links, $abs->as_string;
    }
    return @links;
}
# Check whether $link matches any entry of the file-level @A_BLACKLIST.
#
# Each blacklist entry is used as a regular expression and matched
# unanchored against $link.
#
# Parameters:
#   $link - URL to test
#
# Returns 1 if some entry matches, 0 otherwise.
sub in_blacklist {
    my $link = shift;

    foreach my $entry (@A_BLACKLIST) {
        # Work on a copy so the shared list is not modified on every call.
        my $pattern = $entry;
        chomp $pattern;

        # Skip blank lines: an empty pattern makes Perl reuse the last
        # successfully matched regex instead of matching the empty string,
        # which gives unpredictable results.
        next unless length $pattern;

        return 1 if $link =~ /$pattern/;    # link is blacklisted
    }

    # No entry matched: link is not blacklisted.
    return 0;
}
# Store a URL in the `mose` table unless it is already present.
#
# For a link that is not yet in the table, the page is fetched to read its
# title and a row (title, link, deep, timestamp) is inserted. Links that
# are already stored are left untouched (no update is implemented).
#
# Parameters:
#   $link - URL to store
#   $deep - depth value recorded together with the URL
#
# Dies if the database connection cannot be established.
sub read_url_to_db {
    my ($link, $deep) = @_;
    my $timestamp = time();

    # NOTE(review): the connection data is hard-coded here although
    # $M_DB_HOST / $M_DB_USER / $M_DB_PASS are declared at file level;
    # confirm which of the two is meant to be used.
    my $dbh = DBI->connect('DBI:mysql:1_mose', '1_mose', '1_mose')
        || die "Could not connect to database: $DBI::errstr";

    # Placeholders keep quotes in the URL or in the (remote, untrusted)
    # page title from breaking or injecting into the SQL statements.
    my $sth = $dbh->prepare('SELECT count(*) FROM mose WHERE link=?');
    $sth->execute($link);
    my ($known) = $sth->fetchrow_array();
    $sth->finish();

    if (!$known) {    # not found: fetch the title and insert a new row
        # The page is only fetched when a row is actually going to be
        # written; the title is not needed for links already stored.
        my $mech = WWW::Mechanize->new(onerror => undef);
        $mech->get($link);
        my $title = $mech->title();
        $title = '' unless defined $title;    # failed fetch or non-HTML page

        print "'$title', '$link', '$deep', '$timestamp'\n";
        $sth = $dbh->prepare(
            'insert into mose (title, link, deep, timestamp) values (?, ?, ?, ?)'
        );
        $sth->execute($title, $link, $deep, $timestamp);
        $sth->finish();
    }

    $dbh->disconnect();
    return;
}