"""pageget.py: Functions for the Apache access_log recording page fetches.
$Revision: 1.10 $ $Date: 2003/02/16 03:02:44 $
Exports:
def scanAccessLog(logFile): Converts an access_log to PageGet objects
[ if logFile is a readable file handle ->
logFile := logFile advanced to end of file
sys.stderr +:= messages about lines from logFile that
aren't valid access_log lines, if any
generate a sequence of PageGet objects representing the
valid access_log lines from logFile ]
class PageGet: Represents one web page access (or attempt)
PageGet ( accessor, username, when, command, url, status, referrer ):
[ if (accessor is the remote site's IP address as a string)
and (username is the username if the page is under
password protection, "-" otherwise)
and (when is the time of the access as an epoch time)
and (command is the command, e.g., "GET" or "POST")
and (url is the URL the accessor wants to load)
and (status is the usual HTTP return status as an integer)
and (referrer is the referring URL or "-" if unknown) ->
return a new PageGet object representing that access ]
.accessor: [ as passed to constructor ]
.username: [ as passed to constructor ]
.when: [ as passed to constructor ]
.command: [ as passed to constructor ]
.url: [ as passed to constructor ]
.status: [ as passed to constructor ]
.referrer: [ as passed to constructor ]
.isLocal ( symDomain, ipDomain ):
[ if (symDomain is a list of strings representing the
symbolic form of the local domain such as ["nmt","edu"])
and (ipDomain is a list of strings representing the
numeric form of the local domain such as ["129","138"]) ->
if (self.accessor is a single name)
or (its last parts match symDomain)
or (its first parts match ipDomain) ->
return 1
else -> return 0 ]
str(self):
[ return a string representing self for debug use ]
"""
#================================================================
# IMPORTS
#----------------------------------------------------------------
#--
# The next line is necessary to use generators with Python 2.2.
#--
from __future__ import generators
import sys # System library
import string # String functions
import time # Standard Python time module
import re # Regular expression module
import urllib # URL encoding and decoding functions
#================================================================
# VERIFICATION FUNCTIONS
#----------------------------------------------------------------
# eff-host-list ( host-group ) ==
# if sec-host-list(host-group) is empty or contains only "-" ->
# pri-host(host-group)
# else ->
# sec-host-list(host-group)
#--
# This describes the rule for deducing which original hosts
# fetched a page. See the specification for a discussion.
#----------------------------------------------------------------
# host-group ==
# The part of the access_log record up to the first "[",
# with syntax:
# pri-host " " [ sec-host ", " ... ] " - " username " "
# where username is usually "-" but contains a user name
# if the page is under password control.
#----------------------------------------------------------------
# clean-url ( url ) ==
# if url contains a "?" ->
# url up to the first "?" with URL encoding decoded
# else ->
# url with URL encoding decoded
#----------------------------------------------------------------
# - - - s c a n A c c e s s L o g - - -
def scanAccessLog(logFile):
    """Read an Apache access_log, generating a stream of PageGet objects.

      [ if logFile is a readable file handle ->
          logFile  :=  logFile advanced to end of file
          sys.stderr  +:=  messages about lines from logFile that
                           aren't valid access_log lines, if any
          generate a sequence of PageGet objects representing the
          valid access_log lines from logFile ]

    This is a generator.  logFile is consumed by plain iteration, one
    line at a time.  Each line is handed to scanAccessLine(), which
    returns either a list of PageGet objects or an error string.
    """
    #-- 1 --
    errCount = 0

    #-- 2 --
    # [ for each line of logFile:
    #     if the line is a valid access_log line ->
    #       yield the PageGet objects representing that line
    #     else ->
    #       errCount  +:=  1
    #       sys.stderr  +:=  the error text for that line ]
    for rawLine in logFile:
        parsed = scanAccessLine(rawLine)

        # Dispatch on "is it a list" rather than "is it exactly a str":
        # anything that is not a list of PageGets is error text, whatever
        # flavour of string it happens to be.
        if isinstance(parsed, list):
            for get in parsed:
                yield get
        else:
            errCount += 1
            # Every rejected line is echoed to stderr.
            sys.stderr.write("*** %s\n" % parsed)

    #-- 3 --
    # [ if errCount > 0 ->
    #     sys.stderr  +:=  message about (errCount) errors
    #   else -> I ]
    if errCount > 0:
        sys.stderr.write("*** Count of unrecognizeable access_log lines: "
                         "%d\n" % errCount)
# - - - s c a n A c c e s s L i n e - - -
def scanAccessLine(rawLine):
    """Process one access_log line.

      [ if rawLine is a string ->
          if rawLine is a valid access_log line ->
            return a list of one or more PageGet objects representing
            that line
          else ->
            return a string containing error text and rawLine ]

    The line is first split into four major groups, then each group is
    parsed by its own scanner.  Any scanner may raise ValueError; the
    first one that does so ends processing and its message, prefixed by
    the name of the failing stage, is returned together with rawLine.

    One PageGet is produced per effective accessor host; all of them
    share the same username, time, command, URL, status and referrer.
    """
    # The active exception is fetched with sys.exc_info()[1] instead of
    # binding a name in the except clause, because that is the one
    # spelling accepted by every Python version from 2.2 onward.

    #-- 1 --
    # [ if rawLine looks like a valid access_log line at the group level ->
    #     accessGroup, dateGroup, cmdGroup, tailGroup  :=  its four groups
    #   else ->
    #     return an error string ]
    try:
        accessGroup, dateGroup, cmdGroup, tailGroup = scanGroups(rawLine)
    except ValueError:
        return "Group syntax error, %s: %s" % (sys.exc_info()[1], rawLine)

    #-- 2 --
    # [ if accessGroup is a valid host-group ->
    #     accessorList  :=  eff-host-list ( accessGroup )
    #     username      :=  username from accessGroup
    #   else ->
    #     return an error string ]
    try:
        accessorList, username = scanAccessGroup(accessGroup)
    except ValueError:
        return "Host group error, %s: %s" % (sys.exc_info()[1], rawLine)

    #-- 3 --
    # [ if dateGroup is a valid date/time ->
    #     when  :=  that date as an epoch time
    #   else ->
    #     return an error string ]
    try:
        when = scanDateGroup(dateGroup)
    except ValueError:
        return "Date group error, %s: %s" % (sys.exc_info()[1], rawLine)

    #-- 4 --
    # [ if cmdGroup is a valid command group ->
    #     command, url  :=  command and URL from cmdGroup
    #   else ->
    #     return an error string ]
    try:
        command, url = scanCmdGroup(cmdGroup)
    except ValueError:
        return "Command group error, %s: %s" % (sys.exc_info()[1], rawLine)

    #-- 5 --
    # [ if tailGroup is a valid tail group ->
    #     status, referrer  :=  status and referring URL from tailGroup
    #   else ->
    #     return an error string ]
    try:
        status, referrer = scanTailGroup(tailGroup)
    except ValueError:
        return "Tail group error, %s: %s" % (sys.exc_info()[1], rawLine)

    #-- 6 --
    # [ return a list of PageGet objects, one per accessor in
    #   accessorList, sharing all the other values ]
    return [PageGet(a, username, when, command, url, status, referrer)
            for a in accessorList]
# - - - s c a n G r o u p s - - -
#--
# Declarations for regular expressions. The log record is so complex
# we use two levels of recognition: first we divide it up according to:
# accessor-group '[' date-group '] "' cmd-group '" ' tail-group
# The _GROUP declarations are the field codes used in the re package's
# (?P<field-code>) construct; linePat is the compiled regular expression.
#--
ACCESSOR_GROUP = "a"    # Accessor/proxy information
DATE_GROUP = "d"        # Date, time, zone correction
CMD_GROUP = "c"         # Command, URL, protocol
TAIL_GROUP = "t"        # Result code, length, referrer

linePat = re.compile(   # Compiled reg.ex. for the whole line
    r'^'                    # Anchor to start of line
    r'(?P<%s>[^\[]+)'       # accessor-group: everything before the first "["
    r'\['                   # Open bracket
    r'(?P<%s>[^\]]+)'       # date-group: everything before the next "]"
    r'\] "'                 # Close bracket, space, double-quote
    r'(?P<%s>[^"]+?)'       # cmd-group: everything before the next '"'
    r'" '                   # Double-quote, space
    r'(?P<%s>.*$)'          # tail-group: the rest of the line; "." does not
                            #   match "\n", so a trailing newline is excluded
    % (ACCESSOR_GROUP, DATE_GROUP, CMD_GROUP, TAIL_GROUP))


def scanGroups(rawLine):
    """Break an access_log line down into its major groups.

      [ if rawLine is a valid access_log line at the group level ->
          return (accessor group, date group, cmd group, tail group)
          as a tuple of strings
        else ->
          raise ValueError ]

    The groups are returned exactly as matched: no stripping is done, so
    the accessor group normally still carries its trailing space.
    """
    #-- 1 --
    # [ if rawLine matches linePat ->
    #     m  :=  a Match object describing the match
    #   else ->
    #     raise ValueError ]
    m = linePat.match(rawLine)
    if m is None:
        raise ValueError("access_log group syntax")

    #-- 2 --
    # [ return the four named fields of m, in record order ]
    # Match.group() given several names returns them as one tuple.
    return m.group(ACCESSOR_GROUP, DATE_GROUP, CMD_GROUP, TAIL_GROUP)
# - - - s c a n A c c e s s G r o u p - - -
def scanAccessGroup(accessGroup):
    """Determine the set of effective accessor hosts from accessGroup.

      [ if accessGroup is a string ->
          if accessGroup is a valid host-group ->
            return (eff-host-list ( accessGroup ), username from accessGroup )
          else ->
            raise ValueError ]

    A valid host-group, split at whitespace, has the shape
        pri-host [ sec-host ... ] "-" username
    so it always has at least three fields and its next-to-last field
    is "-".  The result is a tuple (list of host strings, username).
    """
    #-- 1 --
    # [ fields  :=  accessGroup, broken into pieces at whitespace ]
    fields = accessGroup.split()

    #-- 2 --
    # [ if fields has fewer than three elements, or its next-to-last
    #   element is not "-" ->
    #     raise ValueError
    #   else ->
    #     priHost      :=  fields[0]
    #     secHostList  :=  fields[1:-2]
    #     username     :=  fields[-1] ]
    # Three fields are the minimum: with only two, the "-" separator
    # itself would be mistaken for the primary host.
    if len(fields) < 3:
        raise ValueError("Too few accessor fields: %s" % accessGroup)
    if fields[-2] != "-":
        raise ValueError("Penultimate accessor fields not '-': %s" %
                         accessGroup)
    priHost = fields[0]
    secHostList = fields[1:-2]
    username = fields[-1]

    #-- 3 --
    # [ if secHostList is empty or contains only "-" ->
    #     return ([priHost], username) ]
    if not secHostList or secHostList == ["-"]:
        return ([priHost], username)

    #-- 4 --
    # [ return (secHostList with one trailing comma removed from each
    #   element that has one, username) ]
    hostList = []
    for secHost in secHostList:
        if secHost.endswith(","):
            secHost = secHost[:-1]
        hostList.append(secHost)
    return (hostList, username)
# - - - s c a n D a t e G r o u p - - -
DOM_CODE = "D"      # Day of month field
MON_CODE = "M"      # Month field (e.g., "Jan")
YYYY_CODE = "Y"     # Year field
HOUR_CODE = "h"     # Hour field
MIN_CODE = "m"      # Minutes field
SEC_CODE = "s"      # Seconds field

datePat = re.compile(
    r'(?P<%s>[0-9]{2})'         # Day of month
    r'/'
    r'(?P<%s>[a-zA-Z]{3})'      # Three-letter month code
    r'/'
    r'(?P<%s>[0-9]{4})'         # Four-digit year
    r':'
    r'(?P<%s>[0-9]{2})'         # Hour
    r':'
    r'(?P<%s>[0-9]{2})'         # Minute
    r':'
    r'(?P<%s>[0-9]{2}) '        # Second, then the space before the zone
    % (DOM_CODE, MON_CODE, YYYY_CODE, HOUR_CODE, MIN_CODE,
       SEC_CODE))

# Month numbers keyed by lowercased three-letter month code.
monthDict = {"jan": 1, "feb": 2, "mar": 3, "apr": 4,
             "may": 5, "jun": 6, "jul": 7, "aug": 8,
             "sep": 9, "oct": 10, "nov": 11, "dec": 12}


def scanDateGroup(dateGroup):
    """Extract the access timestamp from the raw dateGroup.

      [ if dateGroup is a string ->
          if dateGroup is a valid date/time ->
            return that date as an epoch time
          else ->
            raise ValueError ]

    NB: The zone correction given in the record is ignored; the date
    and time fields are interpreted as local civil time by
    time.mktime().  The daylight-saving flag passed to mktime() is -1,
    which asks the library to work out for itself whether daylight
    time was in effect at that wall-clock time.
    """
    #-- 1 --
    # [ if dateGroup matches datePat ->
    #     m  :=  a Match object for that match
    #   else ->
    #     raise ValueError ]
    m = datePat.match(dateGroup)
    if m is None:
        raise ValueError("Bad date format: <%s>" % dateGroup)

    #-- 2 --
    # [ if the month field of m is a known month code, in any case ->
    #     monthNo  :=  the corresponding month number
    #   else -> raise ValueError ]
    monthKey = m.group(MON_CODE).lower()
    try:
        monthNo = monthDict[monthKey]
    except KeyError:
        raise ValueError("Unknown month code <%s>" % monthKey)

    #-- 3 --
    # [ timeTuple  :=  a time-tuple (for module time) made from the
    #   numeric fields of m and monthNo ]
    timeTuple = (int(m.group(YYYY_CODE)),   # [0] year
                 monthNo,                   # [1] month
                 int(m.group(DOM_CODE)),    # [2] day of month
                 int(m.group(HOUR_CODE)),   # [3] hour
                 int(m.group(MIN_CODE)),    # [4] minute
                 int(m.group(SEC_CODE)),    # [5] second
                 0,                         # [6] day of week
                 0,                         # [7] ordinal day of year
                 -1)                        # [8] dst: let mktime decide

    #-- 4 --
    # [ return (timeTuple, treated as local time, converted to epoch time) ]
    # Callers only handle ValueError, so a date mktime() cannot
    # represent is reported the same way as any other bad date.
    try:
        return time.mktime(timeTuple)
    except OverflowError:
        raise ValueError("Date out of range: <%s>" % dateGroup)
# - - - s c a n C m d G r o u p - - -
CMD_FIELD = "c"     # Command, usually GET or POST
URL_FIELD = "u"     # URL

cmdPat = re.compile(
    r'(?P<%s>\w+)'      # Command: one or more word characters
    r' '                # Space after command
    r'(?P<%s>[^ ]+)'    # URL: everything up to the next space
    r' '                # Space after URL
    % (CMD_FIELD, URL_FIELD))


def scanCmdGroup(cmdGroup):
    """Extract the command and URL from the command group.

      [ if cmdGroup is a string ->
          if cmdGroup is a valid command group ->
            return (command from cmdGroup, clean-url ( URL from cmdGroup ))
          else ->
            raise ValueError ]

    cmdPat requires a space after the URL, so a group that ends
    immediately after the URL is rejected.  Whatever follows that space
    is not examined.
    """
    #-- 1 --
    # [ if cmdGroup matches cmdPat ->
    #     m  :=  a Match object describing the match
    #   else ->
    #     raise ValueError ]
    m = cmdPat.match(cmdGroup)
    if m is None:
        raise ValueError("Unrecognizable command/url/protocol group.")

    #-- 2 --
    # [ return (CMD_FIELD from m, clean-url ( URL_FIELD from m )) ]
    return (m.group(CMD_FIELD), cleanURL(m.group(URL_FIELD)))
# - - - c l e a n U R L - - -
def cleanURL(rawURL):
    """Remove from an URL any "?..." tail and any URL-encoding.

      [ if rawURL is a string ->
          return clean-url ( rawURL ) ]

    That is: keep only the part of rawURL before its first "?" (all of
    it when there is no "?"), then decode %XX escapes in what is left.
    """
    # unquote lives in a different module depending on the Python major
    # version; import it locally from whichever location exists.
    try:
        from urllib import unquote          # Python 2
    except ImportError:
        from urllib.parse import unquote    # Python 3

    #-- 1 --
    # [ head  :=  rawURL up to (not including) its first "?", or all of
    #             rawURL if it contains no "?" ]
    head = rawURL.split("?", 1)[0]

    #-- 2 --
    # [ return head with URL-encoded characters unquoted ]
    return unquote(head)
# - - - s c a n T a i l G r o u p - - -
STATUS_FIELD = "c"      # Result code, e.g., 200 ok, 404 not found
REFERRER_FIELD = "r"    # Referring URL or "-"

tailPat = re.compile(
    r'(?P<%s>\d+)'      # Result code: all leading digits
    r' '                # One space
    r'[^ ]*'            # Length, everything up to next space
    r' "'               # One space, opening double quote
    r'(?P<%s>[^"]*)'    # Referrer URL, up to next double quote
    r'"'                # Closing double quote
    % (STATUS_FIELD, REFERRER_FIELD))


def scanTailGroup(tailGroup):
    """Extract the status and referring URL from the tail group.

      [ if tailGroup is a string ->
          if tailGroup is a valid tail group ->
            return (status from tailGroup as an integer,
                    clean-url ( referring URL from tailGroup ))
          else ->
            raise ValueError ]

    Only the start of tailGroup is matched; anything after the closing
    quote of the referrer is ignored.
    """
    #-- 1 --
    # [ if tailGroup matches tailPat ->
    #     m  :=  a Match object describing that match
    #   else ->
    #     raise ValueError ]
    m = tailPat.match(tailGroup)
    if m is None:
        raise ValueError("Invalid status/length/referrer group <%s>" %
                         tailGroup)

    #-- 2 --
    # [ return (STATUS_FIELD from m as an integer,
    #           clean-url ( REFERRER_FIELD from m )) ]
    return (int(m.group(STATUS_FIELD)), cleanURL(m.group(REFERRER_FIELD)))
# - - - - - c l a s s P a g e G e t - - - - -
class PageGet:
    """Class to represent one line from the Apache access_log.

    Attributes (all stored exactly as passed to the constructor):
      .accessor, .username, .when, .command, .url, .status, .referrer
    """

    # - - -   P a g e G e t . _ _ i n i t _ _   - - -

    def __init__(self, accessor, username, when, command, url,
                 status, referrer):
        "Constructor for PageGet: store every argument as an attribute."
        self.accessor = accessor
        self.username = username
        self.when = when
        self.command = command
        self.url = url
        self.status = status
        self.referrer = referrer

    # - - -   P a g e G e t . i s L o c a l   - - -

    def isLocal(self, symDomain, ipDomain):
        """Is this record from our domain?  Returns 1 if so, else 0.

          [ if (symDomain is a sequence of strings, the symbolic form
            of the local domain such as ["nmt","edu"])
            and (ipDomain is a sequence of strings, the numeric form
            of the local domain such as ["129","138"]) ->
              if self.accessor has fewer dot-separated parts than
              either domain ->
                return 0 if its first part is "unknown", else 1
              else if its first parts match ipDomain
              or its last parts match symDomain ->
                return 1
              else -> return 0 ]
        """
        #-- 1 --
        # [ parts  :=  self.accessor split at "." ]
        parts = self.accessor.split(".")
        nParts = len(parts)
        nSym = len(symDomain)
        nIp = len(ipDomain)

        #-- 2 --
        # An accessor shorter than either domain cannot be compared
        # against it; such short names are taken to be local, except
        # for the placeholder "unknown".
        if nParts < nSym or nParts < nIp:
            if parts[0] == "unknown":
                return 0        # Unknown sites considered off-site.
            return 1            # Too short to be off-site => local.

        #-- 3 --
        # [ if ipDomain matches the first elements of parts -> return 1
        #   else -> I ]
        # list() lets a tuple argument compare equal to the list slice.
        if parts[:nIp] == list(ipDomain):
            return 1            # Leading IPs match: it's local.

        #-- 4 --
        # [ if symDomain matches the last elements of parts -> return 1
        #   else -> return 0 ]
        # The slice starts at nParts - nSym rather than -nSym so that an
        # empty symDomain selects an empty tail and not the whole list.
        if parts[nParts - nSym:] == list(symDomain):
            return 1            # Trailing names match: it's local.
        return 0

    # - - -   P a g e G e t . _ _ s t r _ _   - - -

    # NOTE: __str__ below does not use this constant; it formats the
    # date with its own, more compact pattern.
    TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __str__(self):
        """Output self as a string, for debug use.

        Layout: date/time, accessor (with " [PWD]" appended when
        self.username is not "-"), command, status, URL.  The date/time
        is self.when rendered in local time.
        """
        humanDate = time.strftime("%Y%m%d %H%M%S",
                                  time.localtime(self.when))
        if self.username != "-":
            isPassword = " [PWD]"
        else:
            isPassword = ""
        return ("%s %s%s %s %s %s" %
                (humanDate, self.accessor, isPassword,
                 self.command, self.status, self.url))