#!/usr/bin/perl
# ====================================================================
# Copyright (c) 2000 Astonish Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ====================================================================

# proclog.pl
# blazonry.com
# Created On: 2000-10-24
# DESC: Extract search terms out of log files
#       print out results to standard out

#------------------------------------------------------
# CONFIGURE
#------------------------------------------------------
# break out keywords
#   1 = yes break into individual words
#   0 = no.
#       leave as search phrases
my $keyword = 1;
#------------------------------------------------------

use strict;
use warnings;

# My Apache Log Format
# LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\""
#
# Fields in order: client ip, %l, user, [date time], "request",
# status code, bytes, "referer", "user-agent".
# Only the referer is needed, so the pattern walks the quoted request,
# the status code and the byte count and captures the next quoted field.
# Quoted fields may contain backslash-escaped characters (e.g. \").
my $log_line_re = qr{
    " (?: [^"\\] | \\. )* "            # quoted request line
    \s+ \S+                            # status code
    \s+ \S+                            # bytes sent
    \s+ " ( (?: [^"\\] | \\. )* ) "    # quoted referer (capture 1)
    \s                                 # a further field must follow
}x;

# Search sites recognised in the referer: a pattern for the site and the
# name of the URL parameter that carries the search phrase.
# ** add more search sites **
# *** to save time order with most hit search site first ***
my @search_sites = (
    [ qr{https?://www\.google\.com},    'q' ],
    [ qr{https?://google\.yahoo\.com},  'p' ],
    [ qr{https?://www\.altavista\.com}, 'q' ],
);

# Characters that must not reach the HTML report unescaped.
my %html_entity = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
);

# Term (or phrase) => number of times it was seen.  File-scoped because
# the sort comparator hashvaluesort() reads it.
my %qhash;

#-----------------------------------------
## URL DECODE
#-----------------------------------------
# Decodes a URL-encoded value: '+' becomes a space and every %XX escape
# (hex digits in either case) becomes the byte it names.
# Returns the decoded string; an undefined argument yields ''.
sub urldecode {
    my ($val) = @_;
    return '' if !defined $val;

    # '+' must be translated first so that a literal plus sign encoded
    # as %2B survives decoding.
    $val =~ s/\+/ /g;
    $val =~ s/%([0-9A-Fa-f]{2})/pack('C', hex($1))/ge;
    return $val;
}

#-----------------------------------------
## HASH SORT
#-----------------------------------------
# Sort comparator over the keys of %qhash: highest count first, ties
# broken alphabetically so the report order is repeatable.
sub hashvaluesort {
    return $qhash{$b} <=> $qhash{$a} || $a cmp $b;
}

#-----------------------------------------
## EXTRACT QUERY
#-----------------------------------------
# Takes a referer URL and returns the still URL-encoded search phrase if
# the referer belongs to one of @search_sites and carries that site's
# query parameter.  Returns nothing otherwise.
sub extract_query {
    my ($ref) = @_;

    # No query string, nothing to extract.
    return if !defined $ref || index($ref, '?') < 0;

    for my $site (@search_sites) {
        my ($host_re, $param) = @{$site};

        # The parameter may be the first, a middle or the last one, so it
        # is delimited by '?' or '&' in front and '&' or end-of-string behind.
        if ($ref =~ m{ $host_re .*? [?&] \Q$param\E = ([^&]*) }x) {
            return $1;
        }
    }
    return;
}

#-----------------------------------------
## HTML ESCAPE
#-----------------------------------------
# Returns the text with & < > " replaced by HTML entities.  Query terms
# come from request headers written by arbitrary clients, so they are
# escaped before being placed in the report.
sub html_escape {
    my ($text) = @_;
    $text =~ s/([&<>"])/$html_entity{$1}/g;
    return $text;
}

#-----------------------------------------
## COUNT QUERY
#-----------------------------------------
# Records one decoded, lower-cased query in %qhash: word by word when
# $keyword is set, otherwise as a whole phrase.
sub count_query {
    my ($query) = @_;

    if ($keyword) {
        # split ' ' ignores leading whitespace and runs of whitespace
        $qhash{$_}++ for split ' ', $query;
    }
    else {
        $qhash{$query}++;
    }
    return;
}

#-----------------------------------------
## PRINT REPORT
#-----------------------------------------
# Writes the frequency table to standard output as an HTML page.
# NOTE(review): the tags of the original print statements were lost; this
# markup is reconstructed from the surviving title and column headings.
sub print_report {
    print "<html>\n";
    print "<head><title>Search Engine Query Terms</title></head>\n";
    print "<body>\n";
    print "<table>\n";
    print "<tr><th>Query Term</th><th>Frequency</th></tr>\n";
    for my $q (sort hashvaluesort keys %qhash) {
        printf "<tr><td>%s</td><td>%d</td></tr>\n",
            html_escape($q), $qhash{$q};
    }
    print "</table>\n";
    print "</body></html>\n";
    return;
}

#-----------------------------------------
## MAIN
#-----------------------------------------
# Reads the log file named by the first argument, tallies the search
# terms found in referers and prints the report.
# Returns the process exit status: 1 when no log file was given, else 0.
# Dies when the log file cannot be opened.
sub main {
    my ($logfile) = @_;

    if (!$logfile) {
        print "Log File Not Specified.\n";
        return 1;
    }

    open(my $log, '<', $logfile)
        or die("Unable to Open $logfile: $!");

    while (my $line = <$log>) {
        chomp($line);

        my ($ref) = $line =~ $log_line_re;
        if (!defined $ref) {
            # Reported on STDERR so the HTML on STDOUT stays well formed.
            print STDERR "Invalid Format.\n";
            next;
        }

        my $query = extract_query($ref);
        next if !defined $query || $query eq '';

        count_query(lc(urldecode($query)));
    }
    close($log);

    print_report();
    return 0;
}

# Run only when executed as a script, so the helpers above can be loaded
# from another file without side effects.
exit(main(@ARGV)) unless caller;

1;