<!-- SophiaKnows -->

PERL SEARCH INDEXER
Rev 2: December 2004

#!/usr/bin/perl

# --------------------------------- #
#           SOPHIAKNOWS             #
# --------------------------------- #
#         LIBRARY INDEXER           #
# --------------------------------- #
#  Created:  2000-12-02             #
#  Issued:   2000-12-02             #
#  Modified: 2004-10-15             #
#  Copyright (c) 2004               #
#  A.R. Pisarra, SophiaKnows        #
# --------------------------------- #
# --------------------------------- #

# A. SUB INDEX_FILE

sub Index_File {

# Index_File(WORDLOG, TARGETFILE, TARGETID)
#
# Adds the words of one file to the word index stored in WORDLOG.
#
#   WORDLOG    - path of the index file; it is rewritten in place and is
#                treated as empty when it cannot be read (e.g. first run)
#   TARGETFILE - path of the file whose words are to be indexed
#   TARGETID   - identifier recorded for TARGETFILE in the index
#
# Each index row has the form
#      word","id=count,id=count,...
# Returns the result of closing the rewritten index (true on success),
# or an empty/undef value when the index cannot be opened for writing.

# 1. READ ARGS (lexicals, so nothing leaks into the caller's globals)

   my ($wordlog, $targetfile, $targetid) = @_;

# 2. READ EXISTING INDEX INTO HASH

   my %logentries;                                  # word => stored history
   if (open(my $in_fh, '<', $wordlog)) {            # 3-arg open: no mode injection
      while (my $storedrow = <$in_fh>) {
         chomp($storedrow);
         my ($storedword, $storedoccurences) = split(/","/, $storedrow, 2);
         next unless defined $storedword && length $storedword;   # skip blank rows
         $logentries{$storedword} =
            defined $storedoccurences ? $storedoccurences : "";
         }
      close($in_fh);
      }

# 3. READ/JOIN FILE BODY INTO VARIABLE

   my $filebody = JoinFilebody($targetfile);

# 4. COUNT OCCURRENCES OF EACH UNIQUE WORD IN THE FILE

   my %wordoccurence;                               # word => count in this file
   foreach my $word ($filebody =~ /\w+/g) {
      $wordoccurence{$word}++;
      }

# 5. INITIALIZE/APPEND WORD HISTORIES

   foreach my $newword (sort keys %wordoccurence) {
      # The count is keyed by $newword; the original looked it up under an
      # undefined variable and therefore stored "id=" with no count at all.
      $logentries{$newword} = "" unless exists $logentries{$newword};
      $logentries{$newword} .= ",$targetid=$wordoccurence{$newword}";
      }

# 6. STORE UPDATED INDEX TO LOG FILE

   my $out_fh;
   unless (open($out_fh, '>', $wordlog)) {
      warn "Index_File: cannot write index '$wordlog': $!\n";
      return;
      }
   foreach my $word (sort keys %logentries) {       # hash keys: one row per word
      (my $history = $logentries{$word}) =~ s/^,//; # strip any initial comma
      print {$out_fh} "$word\",\"$history\n";
      }
   return close($out_fh);
   }

# B. SUB JOIN FILEBODY
sub JoinFilebody {

# JoinFilebody(PATH)
#
# Reads the file at PATH and returns its text as one lowercased string
# with HTML comments, <script> and <style> elements and all remaining
# tags removed. Lines are joined with a single space (a trailing space
# follows the last line). Returns "" when the file cannot be opened.

   my ($path) = @_;
   my $filecontents = "";

   if (defined $path && open(my $fh, '<', $path)) {  # 3-arg open: no mode injection
      while (my $line = <$fh>) {
         chomp($line);                               # strip line ends
         $filecontents .= $line . " ";               # concatenate lines
         }
      close($fh);
      }

   $filecontents =~ tr/A-Z/a-z/;                     # lowercase all

   # ".*?" rather than ".+?": an empty element such as
   # <script src="x.js"></script> must match on its own instead of
   # running on to the NEXT closing tag and swallowing the text between.
   $filecontents =~ s/<!--.*?-->//g;                 # strip comments
   $filecontents =~ s/<script[^>]*>.*?<\/script>//g; # strip scripts
   $filecontents =~ s/<style[^>]*>.*?<\/style>//g;   # strip styles
   $filecontents =~ s/<[^>]+>//g;                    # strip html
   return $filecontents;
   }

1;

#!/usr/bin/perl

# --------------------------------- #
#       LIBRARY SEARCH BASIC        #
# --------------------------------- #
#  Created:  2000-12-02             #
#  Issued:   2000-12-02             #
#  Modified: 2004-10-15             #
#  Copyright (c) 2004               #
#  A.R. Pisarra, SophiaKnows        #
# --------------------------------- #
# --------------------------------- #

# This basic version of the library 
# search: 
#
# (1) Performs a quasi boolean  
# search of the library index for 
# files containing 1 or more instances
# of all words passed to the search
# function; and
#
# (2) Prints a relevancy ranked 
# list of the files matching the 
# searched criteria


# DIRECTORIES/FILES
$the_dir="/data/";
$wordindex=$the_dir."index.txt";
$pagekey=$the_dir."pages.txt";       # was: "$the_dir."pages.txt" (stray quote, syntax error)

# INCLUDE/INVOKE PARSEFORM
# parseform.pl is expected to supply Parse_Form, which fills %formdata
require "parseform.pl";
&Parse_Form;

# SEARCH WORD(S)
# The query is lowercased before lookup and split on whitespace.
$searchwords=$formdata{'restrictions'};
$searchwords="" unless defined $searchwords;   # no query submitted
$searchwords=~tr/A-Z/a-z/;
@searchwords=split(" ",$searchwords);

# INITIALIZE SEARCH INDEX (fills %wordhistories)
&InitWordlog($wordindex);

# INITIALIZE PAGE KEY (fills %pagefiles and %pagetitles)
&InitPageKey($pagekey);

# FIND MATCHED ENTRIES
#   %pagematches: page id => number of search words found on the page
#   %matches:     page id => total occurrences of all search words
foreach my $searchword (@searchwords) {
   next unless $wordhistories{$searchword};
   foreach my $pagehit (split(/\,/,$wordhistories{$searchword})) {
      my ($pid,$hits)=split(/=/,$pagehit);
      $pagematches{$pid}++;
      $matches{$pid}+=$hits;
      }
   }

# PRINT RESULTS
print "Content-type: text/html\n\n";
print "<p>PAGE HITS IN DESCENDING RANKED ORDER:</p>\n";
foreach $key (sort HashByDescendingValues (keys(%matches))) {
   # $#searchwords is (word count - 1), so this keeps only pages on
   # which every search word was found
   if($pagematches{$key}>$#searchwords) {
      print "<br />$matches{$key} HITS \@ ";
      print "<a href=$pagefiles{$key}>$pagetitles{$key}</a>\n";
      }
   }

# SUBROUTINES:

# SUB: InitWordlog
sub InitWordlog {

# InitWordlog(PATH)
#
# Loads the word index at PATH into the global %wordhistories
# (word => "id=count,id=count,..."). Rows have the form
#      word","history
# An unreadable index leaves %wordhistories untouched.

   my ($path) = @_;
   return unless defined $path;
   open(my $fh, '<', $path) or return;   # 3-arg open: no mode injection via the path
   while (my $row = <$fh>) {             # lexicals: no longer clobbers @lines/$line
      chomp($row);
      my ($word, $history) = split(/\",\"/, $row);
      next unless defined $word && length $word;   # skip blank rows
      $wordhistories{$word} = $history;
      }
   close($fh);
   }

# SUB: InitPageKey
sub InitPageKey {

# InitPageKey(PATH)
#
# Loads the tab-separated page key (id, file, title per row) into the
# globals %pagefiles and %pagetitles. PATH is honoured when given; the
# global $pagekey is used as a fallback so argument-less calls keep
# working. The original ignored its argument entirely.

   my $keyfile = (defined $_[0] && length $_[0]) ? $_[0] : $pagekey;
   return unless defined $keyfile;
   open(my $fh, '<', $keyfile) or return;   # 3-arg open: no mode injection
   while (my $row = <$fh>) {
      chomp($row);
      my ($pageid, $pagefile, $pagetitle) = split(/\t/, $row);
      next unless defined $pageid && length $pageid;   # skip blank rows
      $pagefiles{$pageid}  = $pagefile;
      $pagetitles{$pageid} = $pagetitle;
      }
   close($fh);
   }

# SUB: HashByDescendingValues
sub HashByDescendingValues {
   # Sort comparator: orders the keys of the global %matches so that
   # the key with the larger numeric value comes first.
   my $hits_for_a = $matches{$a};
   my $hits_for_b = $matches{$b};
   return $hits_for_b <=> $hits_for_a;
   }

1;


#!/usr/bin/perl

# --------------------------------- #
#           SOPHIAKNOWS             #
# --------------------------------- #
#    LIBRARY SEARCH ADVANCED        #
# --------------------------------- #
#  Created:  2000-12-02             #
#  Issued:   2000-12-02             #
#  Modified: 2004-10-15             #
#  Copyright (c) 2004               #
#  A.R. Pisarra, SophiaKnows        #
# --------------------------------- #
# --------------------------------- #

# This version of library search adds 
# a negative word feature to the basic
# search method: a search word prefixed
# with "^" excludes every page that
# contains that word


# FILES
$the_dir="/data/";
$wordindex=$the_dir."index.txt";
$pagekey=$the_dir."pages.txt";       # was: "$the_dir."pages.txt" (stray quote, syntax error)

# INCLUDE/INVOKE PARSEFORM
# parseform.pl is expected to supply Parse_Form, which fills %formdata
require "parseform.pl";
&Parse_Form;

# SEARCH WORD(S)
# The query is lowercased before lookup and split on whitespace.
$searchwords=$formdata{'restrictions'};
$searchwords="" unless defined $searchwords;   # no query submitted
$searchwords=~tr/A-Z/a-z/;
@searchwords=split(" ",$searchwords);

# INITIALIZE SEARCH INDEX (fills %wordhistories)
&InitWordlog($wordindex);

# INITIALIZE PAGE KEY (fills %pagefiles and %pagetitles)
&InitPageKey($pagekey);

# FIND MATCHED ENTRIES
#   %pagematches: page id => number of search words satisfied by the page
#   %matches:     page id => total occurrences of the positive words
#   %notmatches:  page id => number of negative ("^word") words found
foreach my $searchword (@searchwords) {
   if($searchword=~/^\^/) {          # NEGATIVE WORD ("^word")
      $stopwords++;
      (my $negword=$searchword)=~s/^\^//;
      next unless $wordhistories{$negword};
      foreach my $pagehit (split(/\,/,$wordhistories{$negword})) {
         my ($pid,$hits)=split(/=/,$pagehit);
         $notmatches{$pid}++;
         }
      }
   else {                            # POSITIVE WORD
      next unless $wordhistories{$searchword};
      foreach my $pagehit (split(/\,/,$wordhistories{$searchword})) {
         my ($pid,$hits)=split(/=/,$pagehit);
         $pagematches{$pid}++;
         $matches{$pid}+=$hits;
         }
      }
   }

# ID FILES NOT INCLUDING STOPWORD HITS
# A page containing none of the negative words is credited with all of
# them, so it can still reach the "every search word" threshold below;
# a page containing any negative word gets no credit and falls short.
if($stopwords) {
   foreach $key (keys(%matches)) {
      unless($notmatches{$key}) {
         $pagematches{$key}=$pagematches{$key}+$stopwords;
         }
      }
   }

# PRINT RESULTS

print "Content-type: text/html\n\n";
print "<p>PAGE HITS IN DESCENDING RANKED ORDER:</p>\n";
foreach $key (sort HashByDescendingValues (keys(%matches))) {
   # $#searchwords is (word count - 1), so this keeps only pages that
   # satisfied every search word
   if($pagematches{$key}>$#searchwords) {
      print "<br />$matches{$key} HITS \@ ";
      print "<a href=$pagefiles{$key}>$pagetitles{$key}</a>\n";
      }
   }

# SUBROUTINES:

# SUB: InitWordlog
sub InitWordlog {

# InitWordlog(PATH)
#
# Loads the word index at PATH into the global %wordhistories
# (word => "id=count,id=count,..."). Rows have the form
#      word","history
# An unreadable index leaves %wordhistories untouched.

   my ($path) = @_;
   return unless defined $path;
   open(my $fh, '<', $path) or return;   # 3-arg open: no mode injection via the path
   while (my $row = <$fh>) {             # lexicals: no longer clobbers @lines/$line
      chomp($row);
      my ($word, $history) = split(/\",\"/, $row);
      next unless defined $word && length $word;   # skip blank rows
      $wordhistories{$word} = $history;
      }
   close($fh);
   }

# SUB: InitPageKey
sub InitPageKey {

# InitPageKey(PATH)
#
# Loads the tab-separated page key (id, file, title per row) into the
# globals %pagefiles and %pagetitles. PATH is honoured when given; the
# global $pagekey is used as a fallback so argument-less calls keep
# working. The original ignored its argument entirely.

   my $keyfile = (defined $_[0] && length $_[0]) ? $_[0] : $pagekey;
   return unless defined $keyfile;
   open(my $fh, '<', $keyfile) or return;   # 3-arg open: no mode injection
   while (my $row = <$fh>) {
      chomp($row);
      my ($pageid, $pagefile, $pagetitle) = split(/\t/, $row);
      next unless defined $pageid && length $pageid;   # skip blank rows
      $pagefiles{$pageid}  = $pagefile;
      $pagetitles{$pageid} = $pagetitle;
      }
   close($fh);
   }

# SUB: HashByDescendingValues
sub HashByDescendingValues {
   # Sort comparator: orders the keys of the global %matches so that
   # the key with the larger numeric value comes first.
   my $hits_for_a = $matches{$a};
   my $hits_for_b = $matches{$b};
   return $hits_for_b <=> $hits_for_a;
   }

1;



< CODEBASE | TOP^ | MAINPAGE >

Text & Design By Tony Pisarra
© SophiaKnows 1998-2004