init III
This commit is contained in:
1
Perl CGI MOSE My Own Search Engine/cgi-bin/blacklist
Normal file
1
Perl CGI MOSE My Own Search Engine/cgi-bin/blacklist
Normal file
@@ -0,0 +1 @@
|
||||
http://a.a.a
|
||||
@@ -0,0 +1,6 @@
|
||||
# Crawler configuration. This file is loaded with require() by crawl.pl.
our ($M_C_DEEP, $M_C_WHITELIST, $M_C_BLACKLIST, $M_C_LINKMAXAGE);

$M_C_DEEP = 5;                      # depth budget handed to read_link() for each whitelist entry
$M_C_WHITELIST = "whitelist";       # file with the start URLs, one per line
$M_C_BLACKLIST = "blacklist";       # file with blacklist entries, one per line
$M_C_LINKMAXAGE = 60*60*24*30;      # 1 month, in seconds

# require() needs a true return value; state it explicitly instead of relying
# on the last assignment happening to be truthy.
1;
|
||||
@@ -0,0 +1,6 @@
|
||||
# Database configuration. This file is loaded with require() by crawl.pl.
# NOTE(review): the credentials are stored in plain text and are part of the
# repository history - consider rotating them and loading them from outside
# the source tree.
our ($M_DB_HOST, $M_DB_USER, $M_DB_PASS);

$M_DB_HOST = "localhost";
$M_DB_USER = "1_mose";
$M_DB_PASS = "1_mose";

# require() needs a true return value; state it explicitly instead of relying
# on the last assignment happening to be truthy.
1;
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
# Search front-end configuration. This file is loaded with require() by index.pl,
# which interpolates these values into the start page.
$M_BASE_URL = "http://mose.andregeissler.de";           # prefix for the logo image URL
$M_HOME_LOGO = "mose.jpg";                              # logo file, appended to $M_BASE_URL
$M_HOME_LOGO_TARGET = "http://mose.andregeissler.de";   # link target of the logo
$M_HOME_LOGO_WIDTH = "100";
$M_HOME_LOGO_HEIGHT = "100";

# require() needs a true return value; state it explicitly instead of relying
# on the last assignment happening to be truthy.
1;
|
||||
|
||||
105
Perl CGI MOSE My Own Search Engine/cgi-bin/crawl.pl
Normal file
105
Perl CGI MOSE My Own Search Engine/cgi-bin/crawl.pl
Normal file
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/perl
|
||||
use strict;
|
||||
use WWW::Mechanize;
|
||||
use DBI;
|
||||
|
||||
# Enabled here, at file scope, so it covers the rest of the script.
use warnings;

# Configuration globals; their values are assigned by the config files
# required below.
our ($M_C_DEEP, $M_C_WHITELIST, $M_C_BLACKLIST, $M_C_LINKMAXAGE);
our ($M_DB_HOST, $M_DB_USER, $M_DB_PASS);

# The explicit "./" is needed because Perl 5.26+ no longer has the current
# directory in @INC, so a bare relative file name is not found by require().
require "./config.crawl.pl";
require "./config.global.pl";

# NOTE(review): this turns off TLS hostname verification for all LWP requests
# made by this script - confirm that this is really intended.
$ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = 0;

# Start URLs, one per line.
open my $fh_whitelist, '<', $M_C_WHITELIST
    or die "Cannot open whitelist '$M_C_WHITELIST': $!";
my @A_WHITELIST = <$fh_whitelist>;
close $fh_whitelist;
chomp @A_WHITELIST;

# Blacklist entries, one per line. Empty lines are dropped because an empty
# entry would match every URL in in_blacklist().
open my $fh_blacklist, '<', $M_C_BLACKLIST
    or die "Cannot open blacklist '$M_C_BLACKLIST': $!";
my @A_BLACKLIST = <$fh_blacklist>;
close $fh_blacklist;
chomp @A_BLACKLIST;
@A_BLACKLIST = grep { length } @A_BLACKLIST;

# Crawl every start URL that is not blacklisted.
foreach my $link (@A_WHITELIST) {
    next unless length $link;    # ignore blank lines in the whitelist
    read_link($link, $M_C_DEEP) unless in_blacklist($link);
}
|
||||
|
||||
|
||||
# Recursively crawl the page at $link.
#
# Every non-blacklisted link found on the page is stored via read_url_to_db()
# and, while depth budget remains, crawled itself with a budget of $deep - 1.
#
# Parameters:
#   $link - URL of the page whose links are followed
#   $deep - remaining recursion depth; 0 means "store the links, do not descend"
#   $seen - optional hash ref (URL => largest depth budget already used for it);
#           created on the first call and passed down the recursion
#
# Returns nothing.
sub read_link {
    my ($link, $deep, $seen) = @_;
    $seen ||= {};

    foreach my $url (fetch_links($link)) {
        next if in_blacklist($url);

        # Skip URLs already handled with at least this much depth budget;
        # this keeps cyclic link structures from being refetched.
        my $first_visit = !exists $seen->{$url};
        next if !$first_visit && $seen->{$url} >= $deep;
        $seen->{$url} = $deep;

        read_url_to_db($url, $deep) if $first_visit;

        # Pass $deep - 1 to the child instead of decrementing $deep in place,
        # so that all links of one page share the same depth budget.
        read_link($url, $deep - 1, $seen) if $deep > 0;
    }

    return;
}
|
||||
|
||||
|
||||
# Fetch the page at $link and return the absolute URLs of all links on it.
#
# Relative links are resolved against the page's base URL by
# WWW::Mechanize::Link->url_abs. Links with a scheme other than http/https
# (mailto:, javascript:, ...) are dropped, and fragments are removed so that
# "page#a" and "page#b" count as the same URL.
#
# Parameters:
#   $link - URL of the page to fetch
#
# Returns a list of URL strings; the list is empty when the page could not be
# fetched or is not HTML.
sub fetch_links {
    my $link = shift;

    # onerror => undef: failed requests are reported through success(), they
    # do not abort the crawl.
    my $mech = WWW::Mechanize->new(onerror => undef);
    $mech->get($link);
    return unless $mech->success && $mech->is_html;

    my @links;
    foreach my $found ($mech->links()) {
        my $abs = $found->url_abs;
        next unless defined $abs;

        my $scheme = $abs->scheme;
        next unless defined $scheme && $scheme =~ /\Ahttps?\z/i;

        $abs->fragment(undef);
        push @links, $abs->as_string;
    }

    return @links;
}
|
||||
|
||||
|
||||
# Check whether $link is covered by the blacklist.
#
# A link is blacklisted when any entry of @A_BLACKLIST occurs in it as a
# literal substring. Entries are deliberately NOT treated as regular
# expressions: they are URLs, so "." must match a dot only, and an empty entry
# must not match every link.
#
# Parameters:
#   $link - URL to check
#
# Returns 1 if the link is blacklisted, 0 otherwise.
sub in_blacklist {
    my $link = shift;
    return 0 unless defined $link;

    foreach my $black (@A_BLACKLIST) {
        # Work on a copy: $black aliases the element of the shared list.
        my $entry = $black;
        $entry =~ s/\s+\z//;    # strip the line ending and trailing blanks
        next unless length $entry;

        # link is in blacklist
        return 1 if index($link, $entry) >= 0;
    }

    # link is not in blacklist
    return 0;
}
|
||||
|
||||
|
||||
# Read the title of the page at $link and store it, together with the URL,
# the depth and the current time, in table "mose" - unless the URL is already
# stored.
#
# Parameters:
#   $link - URL of the page
#   $deep - depth value stored alongside the URL
#
# Returns nothing. Dies if the database connection cannot be established.
sub read_url_to_db {
    my ($link, $deep) = @_;
    my $timestamp = time();

    # Use the connection settings from config.global.pl instead of repeating
    # the credentials here.
    my $dbh = DBI->connect("DBI:mysql:database=1_mose;host=$M_DB_HOST", $M_DB_USER, $M_DB_PASS)
        or die "Could not connect to database: $DBI::errstr";

    # URL and title come from crawled pages, so they are passed as bind
    # values and never interpolated into the SQL text.
    my ($known) = $dbh->selectrow_array(
        "SELECT count(*) FROM mose WHERE link = ?", undef, $link
    );

    if ($known) {
        # found
        # TODO: refresh the stored row (the original note here says "update").
    }
    else {
        # not found: read the title from the page; it is only needed for the
        # insert, so the page is not fetched for URLs that are already stored.
        my $mech = WWW::Mechanize->new(onerror => undef);
        $mech->get($link);
        my $title = $mech->title();
        $title = '' unless defined $title;    # page without a title, or failed fetch

        print "'$title', '$link', '$deep', '$timestamp'\n";
        $dbh->do(
            "insert into mose (title, link, deep, timestamp) values (?, ?, ?, ?)",
            undef, $title, $link, $deep, $timestamp
        );
    }

    $dbh->disconnect();
    return;
}
|
||||
|
||||
42
Perl CGI MOSE My Own Search Engine/cgi-bin/index.pl
Normal file
42
Perl CGI MOSE My Own Search Engine/cgi-bin/index.pl
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/perl -w

# MOSE start page: prints the logo and the search form, which posts to
# /cgi-bin/search.pl.

use strict;

use CGI;
# NOTE(review): fatalsToBrowser shows error details to every visitor -
# consider removing it in production.
use CGI::Carp qw(fatalsToBrowser);

# Page settings; their values are assigned by config.search.pl.
our ($M_BASE_URL, $M_HOME_LOGO, $M_HOME_LOGO_TARGET,
     $M_HOME_LOGO_WIDTH, $M_HOME_LOGO_HEIGHT);

# The explicit "./" is needed because Perl 5.26+ no longer has the current
# directory in @INC.
require "./config.search.pl";

my $cgi = CGI->new;
print $cgi->header();

print <<START;
<html>
<head>
</head>
<body>
START

print <<BODY;
<div align="center">
<p>
<a href="$M_HOME_LOGO_TARGET"><img width="$M_HOME_LOGO_WIDTH" height="$M_HOME_LOGO_HEIGHT" src="$M_BASE_URL/$M_HOME_LOGO"></a><br>
</p>
<p>
<form action="/cgi-bin/search.pl" method="post">
<input type="hidden" name="mose" value="mose">
<input type="input" size="50" name="search"><br>
<input type="submit" value="Suche">
</form>
</p>
</div>

<div style="position: relative">
<p style="position: fixed; bottom: 0; width:100%; text-align: center">
<font size="-2">copyright by andre geissler 2013</font>
</p>
</div>
BODY

print <<FOOTER;
</body>
</html>
FOOTER
|
||||
0
Perl CGI MOSE My Own Search Engine/cgi-bin/lib.pl
Normal file
0
Perl CGI MOSE My Own Search Engine/cgi-bin/lib.pl
Normal file
30
Perl CGI MOSE My Own Search Engine/cgi-bin/search.pl
Normal file
30
Perl CGI MOSE My Own Search Engine/cgi-bin/search.pl
Normal file
@@ -0,0 +1,30 @@
|
||||
#!/usr/bin/perl -w

# MOSE search result page. Currently a stub: it prints an HTML skeleton with
# a placeholder body and does not read the submitted form fields.

use strict;

use CGI;
# NOTE(review): fatalsToBrowser shows error details to every visitor -
# consider removing it in production.
use CGI::Carp qw(fatalsToBrowser);

my $cgi = CGI->new;

print $cgi->header();

print <<START;
<html>
<head>
</head>
<body>
START


print <<BODY;


...


BODY


print <<FOOTER;
</body>
</html>
FOOTER
|
||||
1
Perl CGI MOSE My Own Search Engine/cgi-bin/whitelist
Normal file
1
Perl CGI MOSE My Own Search Engine/cgi-bin/whitelist
Normal file
@@ -0,0 +1 @@
|
||||
http://www.bimminger.at
|
||||
15
Perl CGI MOSE My Own Search Engine/htdocs/index.html
Normal file
15
Perl CGI MOSE My Own Search Engine/htdocs/index.html
Normal file
@@ -0,0 +1,15 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>MOSE</title>
|
||||
</head>
|
||||
<frameset>
|
||||
<frame src="/cgi-bin/index.pl" name="MOSE">
|
||||
<noframes>
|
||||
<body>
|
||||
<h1>MOSE meldet</h1>
|
||||
<p>Der verwendete Browser ist zu alt. Ein bisschen neuer darf es dann schon sein.</p>
|
||||
</body>
|
||||
</noframes>
|
||||
</frameset>
|
||||
</html>
|
||||
BIN
Perl CGI MOSE My Own Search Engine/htdocs/mose.jpg
Normal file
BIN
Perl CGI MOSE My Own Search Engine/htdocs/mose.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 7.9 KiB |
Reference in New Issue
Block a user