init III
This commit is contained in:
105
Perl CGI MOSE My Own Search Engine/cgi-bin/crawl.pl
Normal file
105
Perl CGI MOSE My Own Search Engine/cgi-bin/crawl.pl
Normal file
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/perl
# MOSE (My Own Search Engine) crawler: reads a whitelist of start URLs and
# recursively indexes them into the database, skipping blacklisted links.
use strict;
use warnings;

use WWW::Mechanize;
use DBI;

# Configuration variables populated by the require'd config files below.
our ($M_C_DEEP, $M_C_WHITELIST, $M_C_BLACKLIST, $M_C_LINKMAXAGE);
our ($M_DB_HOST, $M_DB_USER, $M_DB_PASS);

require "config.crawl.pl";
require "config.global.pl";

# Allow crawling HTTPS sites whose certificate hostname does not verify.
$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;

# Three-arg open with lexical handles and error checking (the originals were
# unchecked two-arg opens on bareword handles, which fail silently and allow
# mode injection through the configured path).
open my $fh_whitelist, '<', $M_C_WHITELIST
    or die "Cannot open whitelist '$M_C_WHITELIST': $!";
my @A_WHITELIST = <$fh_whitelist>;
close $fh_whitelist;

open my $fh_blacklist, '<', $M_C_BLACKLIST
    or die "Cannot open blacklist '$M_C_BLACKLIST': $!";
my @A_BLACKLIST = <$fh_blacklist>;
close $fh_blacklist;

# Crawl every whitelisted start URL that is not blacklisted, to the
# configured maximum depth.
foreach my $link (@A_WHITELIST) {
    chomp $link;
    read_link($link, $M_C_DEEP) unless in_blacklist($link);
}
|
||||
|
||||
|
||||
# Recursively crawl $link, indexing every non-blacklisted URL found,
# descending at most $deep further levels.
sub read_link {
    my ($link, $deep) = @_;

    my @links = fetch_links($link);
    foreach my $url (@links) {
        next if in_blacklist($url);
        read_url_to_db($url, $deep);
        # Recurse with a reduced depth. The original decremented $deep
        # inside this loop ($deep--), so each sibling link was crawled
        # one level shallower than the previous one; passing $deep - 1
        # keeps the depth uniform across siblings at the same level.
        read_link($url, $deep - 1) if $deep > 0;
    }
    return;
}
|
||||
|
||||
|
||||
# Fetch $link and return the list of all hyperlink URLs on the page,
# resolved to absolute URLs.
sub fetch_links {
    my ($link) = @_;

    my $mech = WWW::Mechanize->new(onerror => undef);
    $mech->get($link);
    # On a failed fetch there is nothing to extract.
    return () unless $mech->success;

    my @urls;
    foreach my $found ($mech->links()) {
        # url_abs() resolves relative links against the page's base URL.
        # The original concatenated $link . $url for non-http links, which
        # produces broken URLs for any page that is not a bare host root
        # (e.g. "http://host/dir/page.html" . "other.html").
        push @urls, $found->url_abs()->as_string;
    }
    return @urls;
}
|
||||
|
||||
|
||||
# Return 1 if $link matches any entry of the global blacklist, 0 otherwise.
# Blacklist entries are treated as regex patterns.
sub in_blacklist {
    my ($link) = @_;
    # NOTE: the original also did "my @blacklist = shift;" — dead code that
    # captured at most one extra scalar and was never read; all callers pass
    # a single link and the sub matches against the file-level @A_BLACKLIST.

    foreach my $pattern (@A_BLACKLIST) {
        # chomp the shared list element once; harmless on later passes.
        chomp $pattern;
        # link is in the blacklist
        return 1 if $link =~ /$pattern/;
    }
    # link is not in the blacklist
    return 0;
}
|
||||
|
||||
|
||||
# Read the page title for $link and insert (title, link, deep, timestamp)
# into the `mose` table, unless the link is already stored.
sub read_url_to_db {
    my ($link, $deep) = @_;
    my $timestamp = time();

    # TODO(review): credentials are hard-coded although $M_DB_HOST /
    # $M_DB_USER / $M_DB_PASS are loaded from config.global.pl — they were
    # probably meant to be used here; confirm before switching.
    my $dbh = DBI->connect('DBI:mysql:1_mose', '1_mose', '1_mose')
        or die "Could not connect to database: $DBI::errstr";

    # Placeholders instead of string-interpolated SQL: crawled URLs are
    # untrusted input, and a quote in a URL would break (or inject into)
    # the original interpolated statement.
    my $sth = $dbh->prepare('SELECT count(*) FROM mose WHERE link = ?');
    $sth->execute($link);
    my ($count) = $sth->fetchrow_array();
    $sth->finish();

    unless ($count) {    # not yet in the DB: fetch title and insert
        # Fetch the page only when we actually need the title (the
        # original fetched it unconditionally, even for known links).
        my $mech = WWW::Mechanize->new(onerror => undef);
        $mech->get($link);
        my $title = $mech->title();    # title() takes no arguments

        print "'$title', '$link', '$deep', '$timestamp'\n";
        $sth = $dbh->prepare(
            'INSERT INTO mose (title, link, deep, timestamp) VALUES (?, ?, ?, ?)'
        );
        $sth->execute($title, $link, $deep, $timestamp);
        $sth->finish();
    }

    $dbh->disconnect();
    return;
}
|
||||
|
||||
Reference in New Issue
Block a user