#!/usr/bin/perl
require 5;
use strict;

=head1 copyright

Fluid Dynamics Search Engine, Version 2.0.0.0030

Copyright 1997-2000 by Zoltan Milosevic. Please adhere to the copyright
notice and conditions of use, described in the attached help file and hosted
at the URL below. For the latest version and help files, visit:

	http://www.xav.com/scripts/search/

This search engine is managed from the web, and it comes with a password to
keep it secure. You can set the password when you first visit this script
using the special "Mode=Admin" query string - for example:

	http://my.host.com/search.pl?Mode=Admin

=cut

use vars qw( $VERSION %FORM );
$VERSION = '2.0.0.0030';

# The program body is kept in a single-quoted here-document and executed with
# a string eval at the bottom of this file.  Any compile-time or run-time
# failure therefore lands in $@ and is reported as an HTML page by the code
# that follows the here-document.  The text being eval'd is this literal, not
# request data.
my $all_code = <<'END_OF_FILE';

use vars qw( $realms %const %Rules @MonthNames $global_lockfile_count @lang_strings );

$ENV{'SCRIPT_NAME'} = '' unless $ENV{'SCRIPT_NAME'};#avoid uninit errs

%const = (
	'help_file' => 'http://www.xav.com/scripts/search/admin_help.html',
	'script_start_time' => time(),
	'script_name' => $ENV{'SCRIPT_NAME'},
	'admin_url' => $ENV{'SCRIPT_NAME'} . '?Mode=Admin',
	'form_password' => '',
	'request_method' => 'POST',
	'log_file' => 'search.log.txt',
	'pending_file' => 'search.pending.txt',
	'file_mask' => 0766,
);

@MonthNames = ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec');

my $err_msg = '';

# Request dispatcher.  "Err" is a bare block used as a run-once loop:
#   "next Err" leaves the block through the continue{} clause below, which
#              prints $err_msg as an error page;
#   "last Err" leaves the block without running the continue{} clause.
# Exactly one of the branches below handles any given request.
Err: {

	# Give the folder where all data files are located:
	$err_msg = load_files( 'searchdata' );
	next Err if ($err_msg);

	ReadInput(\%FORM);

	if ($FORM{'NextLink'}) {
		# security re-director from admin screen (prevents query-string-based
		# password from showing up in referer logs of remote systems:
		print "Content-Type: text/html\015\012\015\012";
		# NextLink comes straight from the request and is written into an HTML
		# response, so it is passed through html_encode() (the same helper this
		# file applies to search terms) instead of being echoed verbatim.
		# NOTE(review): html_encode is defined outside this file - presumably it
		# escapes HTML metacharacters; confirm.
		print html_encode($FORM{'NextLink'});
		last Err;
	}

	if (($FORM{'Mode'} eq 'AnonAdd') or ($FORM{'AddSite'})) {
		# Visitor-facing "add a site" page.
		print "Content-Type: text/html\015\012\015\012";
		PrintTemplate( 0, 'header.htm', $Rules{'language'} );
		#changed 0010 - reverse compatibility
		if (defined($FORM{'AddSite'})) {
			$FORM{'URL'} = $FORM{'AddSite'};
		}
		if ((defined($FORM{'Realm'})) and (defined($FORM{'URL'}))) {
			AddURL(0, 1, $FORM{'Realm'}, $FORM{'URL'});
		}
		PrintAddRemoteSiteForm('Add Your Own Website', '', $FORM{'Realm'}, 1);
		PrintFooter($Rules{'allowanonadd'}, 1);
		last Err;
	}

	if (($FORM{'Mode'} ne 'Admin') and (not $FORM{'Terms'})) {
		# No search terms were given: show the empty search form and the tips.
		print "Content-Type: text/html\015\012\015\012";
		PrintTemplate( 0, 'header.htm', $Rules{'language'} );
		SearchForm();
		PrintTemplate( 0, 'tips.htm', $Rules{'language'} );
		PrintFooter($Rules{'allowanonadd'}, 1);
		last Err;
	}

	if ($FORM{'Mode'} ne 'Admin') {
		print "Content-Type: text/html\015\012\015\012";

		# Anonymous search engine code:

		# Idea: add all non-forbidden terms as a string and add that as a phrase
		# with highest priority. Multiply hit relevance by length of string to
		# give longer search terms and phrases more weight.
		# This controls the display, so these extra terms aren't shown to the user:

		# Rank is the 1-based position of the first hit to display.  Only a
		# positive integer is accepted; anything else falls back to 1 so the
		# slice into the sorted hit list below never gets a negative index.
		my $Rank = 1;
		if ((defined($FORM{'Rank'})) and ($FORM{'Rank'} =~ m!^\d+$!) and ($FORM{'Rank'} > 0)) {
			$Rank = $FORM{'Rank'};
		}

		my ($bTermsExist, $Ignored_Terms, $Important_Terms, $DocSearch, $RealmSearch, $where_clause, @SearchTerms) = parse_search_terms($FORM{'Terms'}, $FORM{'Match'});

		my $Realm = $FORM{'Realm'} ? $FORM{'Realm'} : 'All';

		my $NumPagesSearched = 0;
		my @HITS = ();

		# "next Search" leaves this bare block once the hits have been gathered
		# (or when there is nothing to search for).
		Search: {
			unless ($bTermsExist) {
				print "\n";
				next Search;
			}

			if ($Rules{'sql: enable'}) {
				if ($Realm ne 'All') {
					my ($err_msg, $p_realm_data) = $realms->hashref( $Realm );
					unless ($err_msg) {
						$where_clause .= ' AND realm_id = ' . $p_realm_data->{'realm_id'};
						$NumPagesSearched = $p_realm_data->{'pagecount'};
					}
				}
				else {
					foreach my $p_realm_data ($realms->list('all')) {
						$NumPagesSearched += $p_realm_data->{'pagecount'};
					}
				}
				# SearchDatabase is handed a counter by reference; this branch
				# takes the page count from the realm records above and does
				# not read the counter afterwards.
				my $ignored_count = 0;
				SearchDatabase($where_clause, $DocSearch, \$ignored_count, \@HITS);
				next Search;
			}

			# Search terms have been formatted. Now search the database(s):
			# each sub populates @HITS as needed.

			# If Realm is specific, search it - otherwise search all:
			if ($Realm ne 'All') {
				my ($err_msg, $p_realm_data) = $realms->hashref( $Realm );
				if ($p_realm_data) {
					if ($p_realm_data->{'is_runtime'}) {
						SearchRunTime($Realm, $DocSearch, \$NumPagesSearched, \@HITS);
					}
					else {
						SearchIndexFile($p_realm_data->{'file'}, $RealmSearch, \$NumPagesSearched, \@HITS);
					}
				}
			}
			else {
				foreach my $RH ($realms->list('has_file')) {
					SearchIndexFile($RH->{'file'}, $RealmSearch, \$NumPagesSearched, \@HITS);
				}
				foreach my $RH ($realms->list('is_runtime')) {
					SearchRunTime($RH->{'name'}, $DocSearch, \$NumPagesSearched, \@HITS);
				}
			}
		}

		# Paging arithmetic.  $Next is assigned here but not read by any
		# statement visible in this file.
		my ($HitCount, $PerPage, $Next) = (scalar @HITS, $Rules{'hits per page'});
		if (($FORM{'maxhits'} =~ m!^(\d+)$!) and ($FORM{'maxhits'} > 0)) {
			$PerPage = $1;
		}

		my $Remaining = $HitCount - $Rank - $PerPage + 1;
		my $RangeUpper = $Rank + $PerPage - 1;

		if ($Remaining >= $PerPage) {
			$Next = $PerPage;
		}
		elsif ($Remaining > 0) {
			$Next = $Remaining;
		}
		else {
			$RangeUpper = $HitCount;
		}

		PrintTemplate( 0, 'header.htm', $Rules{'language'} );

		print SelectAd(1, @SearchTerms);

		print $lang_strings[8] . "\n";
		if ($Ignored_Terms) {
			printf( $lang_strings[9], html_encode($Ignored_Terms));
		}
		if ($HitCount) {
			printf( $lang_strings[10], html_encode($Important_Terms), $NumPagesSearched);
		}
		else {
			printf( $lang_strings[11], html_encode($Important_Terms), $NumPagesSearched);
		}
		print "\n";

		print SelectAd(2, @SearchTerms);

		PrintHits: {
			if ($HitCount < 1) {
				print "\n", $lang_strings[17], "\n";
				last PrintHits;
			}

			printf( $lang_strings[12], $Rank, $RangeUpper, $HitCount );
			print "\n\n";

			# Each hit is one text record.  The records are sorted as strings,
			# reversed, and the slice for the current page is displayed.
			my $i = $Rank;
			foreach my $hit ((reverse sort @HITS)[($Rank-1)..($RangeUpper-1)]) {
				next unless ($hit =~ m!^(\d+)\.(\d+) u= (.+) t= (.*?) d= (.*?) c= (.*?)$!);
				my ($relevance, $URL, $Title, $Description, $context) = ($1, $3, $4, $5, $6);
				# The second numeric field is fixed-width: the leading two
				# characters are skipped, then come 2+2+4 characters used as
				# day, month and year, and the remainder is the size.
				my ($DD, $MM, $YYYY, $FBYTES) = (unpack('A2A2A2A4A*', $2))[1..4];
				print StandardVersion(
					\@SearchTerms,
					'redirector' => $Rules{'redirector'},
					'rank' => $i,
					'url' => $URL,
					'title' => $Title,
					'description' => $Description,
					'size' => $FBYTES,
					'dd' => $DD,
					'mm' => $MM,
					'yyyy' => $YYYY,
					'context' => $context,
				);
				$i++;
			}

			printf( $lang_strings[13], $Rank, $RangeUpper, $HitCount );

			if ($HitCount > $PerPage) {
				print "\n\n";
				print $lang_strings[14];
				print ' ';

				# $linkhits is the base query string for paging links; no
				# statement visible in this file reads it.  Match is encoded
				# the same way as the other request values placed in it.
				my ($url_realm, $url_match, $url_terms) = (url_encode($FORM{'Realm'}), url_encode($FORM{'Match'}), url_encode($FORM{'Terms'}));
				my $linkhits = "$const{'script_name'}?Realm=$url_realm&Match=$url_match&Terms=$url_terms&Rank=";

				if ($Rank > 1) {
					print "[ << $lang_strings[15] ] ";
				}

				my $nlinks = 1 + int(($HitCount - 1) / $PerPage);
				my $thislink = 1 + int(($Rank - 1)/ $PerPage);

				my $start = 1;
				if ($thislink > 15) {
					$start = $thislink - 15;
				}

				my $x = 0;
				for ($x = $start; $x <= $nlinks; $x++) {
					if ($x == $thislink) {
						print " $x";
					}
					else {
						print " $x\n";
					}
					#changed 0015 - fixed offset-by-2 error
					last if ($x > ($start + 18));
					#end changes
				}
				if ($Remaining > 0) {
					print " [ $lang_strings[16] >> ]";
				}
				print "\n\n\n";
			}
			print "\n\n";
			#end changes
		}

		print SelectAd(3, @SearchTerms);
		SearchForm();
		print SelectAd(4, @SearchTerms);
		PrintFooter($Rules{'allowanonadd'}, 1);
		log_search( $Realm, $FORM{'Terms'}, $Rank, $HitCount, $NumPagesSearched );
		last Err;
	}

	if ($FORM{'Mode'} eq 'Admin') {

		my ($is_auth, $form_password, $url_password) = Authenticate($Rules{'password'});
		last Err unless ($is_auth);

		$const{'form_password'} = $form_password;
		$const{'admin_url'} .= $url_password;

		print "Content-Type: text/html\015\012\015\012";

		# Snapshot of %const handed to the admin header and footer templates.
		my %admin_replace_values = %const;

		PrintTemplate( 0, 'admin_header.txt', $Rules{'language'}, \%admin_replace_values );

		# Admin actions are selected by the "Action" form field; an
		# unrecognized or missing value shows the main admin screen.
		if ($FORM{'Action'} =~ m!^Add\s?URL$!) {
			# allow for single URL, this will need to be cleaned up.
			my @addresses_to_index = ();
			if (defined($FORM{'URL'})) {
				push(@addresses_to_index, $FORM{'URL'});
			}
			else {
				# Otherwise collect every field named A<digits> or AddLink<digits>.
				foreach my $field (keys %FORM) {
					next unless ($field =~ m!^(A|AddLink)\d+$!);
					push(@addresses_to_index, $FORM{$field});
				}
			}
			if (($FORM{'EntireSite'}) and ('1' eq $FORM{'EntireSite'})) {
				$FORM{'StartTime'} = $const{'script_start_time'} - 15;
				my $LimitSite = $FORM{'URL'};

				# turns http://io.com to http://io.com/
				$LimitSite .= '/' if ($LimitSite =~ m!^http://([^\/]+)$!i);

				# turns http://www.io.com/~bob to http://www.io.com/~bob/
				$LimitSite .= '/' if ($LimitSite =~ m!/([^\/\.]+)$!i);

				# turns http://io.com/index.html to http://io.com/
				$LimitSite = $1 if ($LimitSite =~ m!^(.*?)(\w+)\.(\w+)$!);

				$FORM{'Action'} = 'CrawlEntireSite';
				$FORM{'LimitSite'} = $LimitSite;
			}
			AddURL(0, 0, $FORM{'Realm'}, @addresses_to_index);
		}
		elsif ($FORM{'Action'} eq 'Build') {
			my $StartFile = 0;
			if (($FORM{'StartFile'}) and ($FORM{'StartFile'} =~ m!^\d+$!)) {
				$StartFile = $FORM{'StartFile'};
			}
			BuildIndex($FORM{'Realm'}, $StartFile);
		}
		elsif ($FORM{'Action'} eq 'Review') {
			ReviewIndex($FORM{'Realm'}, $FORM{'Start'}, $Rules{'crawler: max pages per batch'});
		}
		elsif ($FORM{'Action'} eq 'ReCrawlRealm') {
			unless ($FORM{'StartTime'}) {
				$FORM{'StartTime'} = $const{'script_start_time'} - 5;
			}
			ReCrawlRealm($FORM{'Realm'});
		}
		elsif ($FORM{'Action'} eq 'CrawlEntireSite') {
			CrawlEntireSite($FORM{'Realm'});
		}
		elsif ($FORM{'Action'} eq 'MaintainRealm') {
			MaintainRealm($FORM{'Realm'});
		}
		elsif ($FORM{'Action'} eq 'ViewLog') {
			ViewLog();
		}
		elsif ($FORM{'Action'} eq 'Edit') {
			PrintEditRecordForm($FORM{'Realm'}, $FORM{'URL'});
		}
		elsif ($FORM{'Action'} eq 'SaveEditedRecord') {
			SaveEditedRecord($FORM{'Realm'}, $FORM{'EditURL'}, $FORM{'URL'}, $FORM{'FBYTES'}, $FORM{'Title'}, $FORM{'Description'}, $FORM{'keywords'}, $FORM{'promote'});
		}
		elsif ($FORM{'Action'} eq 'DeleteRecord') {
			#changed 0022 - allow multiple delete, retain reverse compat
			my @urls_to_delete = ();
			foreach my $field (keys %FORM) {
				next unless ($field =~ m!^URL\d*$!);
				push(@urls_to_delete, $FORM{$field});
			}
			DeleteRecord($FORM{'Realm'}, $FORM{'query_pattern'}, @urls_to_delete);
		}
		elsif ($FORM{'Action'} eq 'FilterRules') {
			FilterRules();
		}
		elsif ($FORM{'Action'} eq 'GeneralRules') {
			GeneralRules('GeneralRules', 1, 1);
		}
		elsif ($FORM{'Action'} eq 'manage_data_storage') {
			manage_data_storage();
		}
		elsif ($FORM{'Action'} eq 'CreateRealmForm') {
			CreateRealmForm($FORM{'Realm'});
		}
		elsif ($FORM{'Action'} eq 'CreateRealm') {
			CreateRealm($FORM{'Realm'}, $FORM{'File'}, $FORM{'BaseDir'}, $FORM{'BaseURL'});
		}
		elsif ($FORM{'Action'} eq 'DeleteRealm') {
			DeleteRealm($FORM{'Realm'});
		}
		elsif ($FORM{'Action'} eq 'AdPage') {
			AdPage($const{'request_method'}, $const{'script_name'}, $const{'form_password'}, %FORM);
		}
		elsif ($FORM{'Action'} eq 'AddForbidSite') {
			# Append the submitted URL to the string list of the filter rule
			# named 'Forbid Sites', then save the rule set.
			my $fr = fdse_filter_rules->new;
			foreach my $p_data ($fr->list) {
				next unless ($p_data->{'name'} eq 'Forbid Sites');
				my $p_strings = $p_data->{'p_strings'};
				push(@$p_strings, $FORM{'URL'});
				last;
			}
			my ($err_msg) = $fr->write();
			if ($err_msg) {
				printf( $lang_strings[0], "could not save 'Forbid Sites' filter rule - $err_msg");
			}
			else {
				printf( $lang_strings[2], "URL '$FORM{'URL'}' is now forbidden (added to the 'Forbid Sites' filter rule)");
			}
		}
		else {
			HTML_UI();
		}
		PrintTemplate( 0, 'admin_footer.txt', $Rules{'language'}, \%admin_replace_values );
		last Err;
	}
	last Err;
}
continue {
	# Reached only through "next Err", i.e. when load_files() reported an error.
	print "Content-Type: text/html\015\012\015\012";
	print "\n\nError: $err_msg.\n\n\n";
}





=item load_files

Usage:

	my $err_msg = load_files($data_files_dir);

This function attempts to load all the script-specific data from files:

	changes directory to the folder containing this script, then to
	    $data_files_dir
	require's all modules
	parses settings.txt and puts the contents into the global %Rules hash.
	reads templates/<language>/strings.txt into the global @lang_strings,
	    one entry per line
	initializes $realms object

Failures with any of these actions are considered fatal errors, and the return
values are set appropriately: the return value is an empty string on success
and an error message otherwise.  A module that cannot be require'd raises an
exception instead of returning a message.  The return value of $realms->load()
is not inspected here.

Dependencies:

	&LoadRules
	%Rules
	...

=cut

sub load_files {
	my ($data_files_dir) = @_;
	my $err_msg = '';
	Err: {

		# This manually sets the current working directory to the directory that
		# contains this script. This is necessary in case people have used a
		# relative path to the $data_files_dir:

		if ($0 =~ m!^(.*)(\\|/)!) {
			my $script_dir = $1;
			chdir($script_dir);
			push(@INC, "$script_dir/searchmods", './searchmods', '../searchmods');
		}
		else {
			push(@INC, './searchmods', '../searchmods');
		}

		# A failed chdir above is caught here, because the relative test is
		# made from whatever the working directory now is.
		unless (-e 'searchmods') {
			$err_msg = "unable to access directory 'searchmods'";
			next Err;
		}

		foreach my $module ('common.pl', 'fdse_realms.pl', 'search_ads.pl', 'filter_rules.pl', 'crawler.pl') {
			require $module;
		}

		unless (chdir($data_files_dir)) {
			$err_msg = "unable to chdir to data files directory '$data_files_dir' - $!";
			next Err;
		}

		# Can we load the rules?
		($err_msg, %Rules) = LoadRules();
		next Err if ($err_msg);

		my $lang_strings_file = 'templates/' . $Rules{'language'} . '/strings.txt';

		my $obj = LockFile->new;
		my $p_rhandle;
		($err_msg, $p_rhandle) = $obj->Read( $lang_strings_file );
		next Err if ($err_msg);

		@lang_strings = ();
		# defined() keeps a final line consisting of a bare "0" (no trailing
		# newline) from ending the loop early; a lexical is used so the
		# caller's $_ is left alone.
		while (defined(my $line = readline($$p_rhandle))) {
			chomp($line);
			push(@lang_strings, $line);
		}
		$err_msg = $obj->Close();
		next Err if ($err_msg);

		$realms = fdse_realms->new;
		$realms->use_database( $Rules{'sql: enable'} );
		$realms->load();

		last Err;
	}
	return $err_msg;
}

END_OF_FILE

# Run the program.  Anything that dies inside it - including a failed require
# or a syntax error in the text above - is reported below as an HTML page.
undef($@);
eval $all_code;
if ($@) {
	# Escape the message before it is written into a text/html response.
	# The entities are assembled from a lookup table and applied in a single
	# pass, so an ampersand produced by one replacement is never re-escaped.
	my %entity_for = ('&' => 'amp', '"' => 'quot', '<' => 'lt', '>' => 'gt');
	my $errstr = $@;
	$errstr =~ s!([&"<>])!'&' . $entity_for{$1} . ';'!ge;

	print "Content-Type: text/html\015\012\015\012";
	print "\n\nPerl Execution Error in $0:\n\n$errstr\n";

	# Emits one newline per submitted form field; the field names and values
	# themselves are not written out by this loop.
	my ($name, $value) = ();
	while (($name, $value) = each %FORM) {
		print "\n";
	}
	print <<"EOM";

Please report this error to the script author:


EOM
}