"""pageget.py:  Functions for the Apache access_log recording page
                fetches.

Public entry point is scanAccessLog(), which turns the lines of an
access_log into a stream of PageGet objects.  The remaining scan*()
functions each parse one piece of a log line and signal malformed
input by raising ValueError.
"""
#================================================================
# IMPORTS
#----------------------------------------------------------------

import sys
import os
import re
import urllib
import datetime
import posixpath

try:
    # Python 3: unquote lives in urllib.parse.
    from urllib.parse import unquote as _py3Unquote
except ImportError:
    # Python 2: urllib.unquote is used directly.
    _py3Unquote = None


# - - - - -   c l a s s   F i x e d T i m e Z o n e

class FixedTimeZone(datetime.tzinfo):
    '''Represents a time zone that's always the same offset from UTC.

      Exports:
        FixedTimeZone ( mmEast, name ):
          [ (mmEast is a zone offset represented by the number of
            minutes east of UTC, or negative for west) and
            (name is a time zone name as a string) ->
              return a new FixedTimeZone instance with those values ]
        .utcoffset(dt):
          [ dt is a datetime.datetime instance ->
              return self's offset as a datetime.timedelta instance ]
        .tzname(dt):
          [ return self's name ]
        .dst(dt):
          [ return a datetime.timedelta of zero ]

      State/Invariants:
        .__offset:   [ self's offset as a datetime.timedelta ]
        .__name:     [ self's name, as passed to constructor ]
        .ZERO:       [ a zero datetime.timedelta ]
    '''
    ZERO = datetime.timedelta(0)

    def __init__(self, mmEast, name):
        '''Constructor
        '''
        self.__offset = datetime.timedelta(minutes=mmEast)
        self.__name = name

    def utcoffset(self, dt):
        return self.__offset

    def tzname(self, dt):
        return self.__name

    def dst(self, dt):
        return self.ZERO


# - - -   s c a n A c c e s s L o g   - - -

def scanAccessLog(logFile):
    """Read an Apache access_log, generating a stream of PageGet objects.

      [ if logFile is a readable file handle (or any iterable of
        line strings) ->
          logFile  :=  logFile advanced to end of file
          sys.stderr  +:=  messages about lines from logFile that
              aren't valid access_log lines, if any
          generate a sequence of PageGet objects representing
          the valid access_log lines from logFile ]
    """
    #-- 1 --
    errCount = 0

    #-- 2 --
    for rawLine in logFile:
        #-- 2.1 --
        # [ if rawLine is a valid access_log line ->
        #     getList  :=  a list of zero or more PageGet objects
        #                  representing that line
        #   else ->
        #     getList  :=  an empty list
        #     errCount  +:=  1
        #     sys.stderr  +:=  error message ]
        try:
            getList = scanAccessLine(rawLine)
        except SyntaxError:
            errCount += 1
            getList = []

        #-- 2.2 --
        # [ generate the elements of getList ]
        for get in getList:
            yield get

    #-- 3 --
    # [ if errCount > 0 ->
    #     sys.stderr  +:=  message about (errCount) errors
    #   else -> I ]
    if errCount > 0:
        error("Count of unrecognizeable access_log lines: %d" %
              errCount)


# - - -   s c a n A c c e s s L i n e   - - -

def scanAccessLine(rawLine):
    """Process one access_log line; returns zero or more PageGets.

      [ if rawLine is a valid Apache access log line ->
          return a list of zero or more PageGet instances
          representing the accesses described by rawLine (empty
          when the command is neither "GET" nor "POST")
        else ->
          sys.stderr  +:=  error message
          raise SyntaxError ]
    """
    #-- 1 --
    # [ if rawLine looks like a valid access_log line at the group
    #   level ->
    #     accessGroup  :=  contents of accessor group as a string
    #     dateGroup  :=  contents of date group as a string
    #     cmdGroup  :=  contents of command group as a string
    #     tailGroup  :=  contents of tail group as a string
    #   else ->
    #     sys.stderr  +:=  error message
    #     raise SyntaxError ]
    try:
        (accessGroup, dateGroup,
         cmdGroup, tailGroup) = scanGroups(rawLine)
    except ValueError as detail:
        error("Group syntax error, %s: %s\n" % (detail, rawLine))
        raise SyntaxError

    #-- 2 --
    # [ if accessGroup is a valid host-group ->
    #     accessorList  :=  effective host list from accessGroup
    #     username  :=  username from accessGroup, or "-" if none
    #   else ->
    #     sys.stderr  +:=  error message
    #     raise SyntaxError ]
    try:
        accessorList, username = scanAccessGroup(accessGroup)
    except ValueError as result:
        error("Host group error, %s: %s" % (result, rawLine))
        raise SyntaxError

    #-- 3 --
    # [ if dateGroup is a valid date/time ->
    #     when  :=  that date as a datetime.datetime instance
    #   else ->
    #     sys.stderr  +:=  error message
    #     raise SyntaxError ]
    try:
        when = scanDateGroup(dateGroup)
    except ValueError as result:
        error("Date group error, %s: %s" % (result, rawLine))
        raise SyntaxError

    #-- 4 --
    # [ if cmdGroup is a valid command group ->
    #     command  :=  command from cmdGroup
    #     url  :=  URL from cmdGroup
    #   else ->
    #     sys.stderr  +:=  error message
    #     raise SyntaxError ]
    try:
        command, url = scanCmdGroup(cmdGroup)
    except ValueError as result:
        error("Command group error, %s: %s" % (result, rawLine))
        raise SyntaxError

    #-- 5 --
    # [ if command is not "GET" or "POST" ->
    #     return an empty list
    #   else ->
    #     url  :=  url with a leading "//" reduced to "/" ]
    # Path normalization keeps exactly two leading slashes, so
    # they are collapsed here.
    if command not in ("GET", "POST"):
        return []
    if url.startswith('//'):
        url = url[1:]

    #-- 6 --
    # [ if tailGroup is a valid tail group ->
    #     status  :=  status from tailGroup
    #   else ->
    #     sys.stderr  +:=  error message
    #     raise SyntaxError ]
    try:
        status = scanTailGroup(tailGroup)
    except ValueError as result:
        error("Tail group error, %s: %s" % (result, rawLine))
        raise SyntaxError

    #-- 7 --
    # [ return a list of PageGet objects for each accessor in
    #   accessorList, using (when, command, url, status)
    #   for the other values ]
    return [PageGet(a, username, when, command, url, status)
            for a in accessorList]


# - - -   s c a n G r o u p s   - - -

ACCESSOR_GROUP = "a"        # Group ID for accessor group
DATE_GROUP = "d"            # Group ID for date group
FRONT_RE = re.compile(      # Matches first two groups
    r'^'                    # Start-of-line anchor
    r'(?P<%s>'              # Start ACCESSOR_GROUP
    r'[^\[]+'               # Everything up to the next '['
    r')'                    # End ACCESSOR_GROUP
    r'\['                   # Open bracket for the date group
    r'(?P<%s>'              # Start DATE_GROUP
    r'[^\]]+'               # Everything up to the next ']'
    r')'                    # End DATE_GROUP
    r'\] ' %                # Trailing bracket and space
    (ACCESSOR_GROUP, DATE_GROUP))


def scanGroups(rawLine):
    """Break an access_log line down into its major groups.

      [ if rawLine is a valid access_log line at the group level ->
          return (accessor group, date group, cmd group, tail group)
          as a sequence of strings
        else -> raise ValueError ]
    """
    #-- 1 --
    # [ if rawLine starts with a pattern that matches FRONT_RE ->
    #     accessorGroup  :=  group ACCESSOR_GROUP from the match
    #     dateGroup  :=  group DATE_GROUP from the match
    #     rest  :=  rawLine beyond the match
    #   else -> raise ValueError ]
    m = FRONT_RE.match(rawLine)
    if m is None:
        raise ValueError("access_log group syntax")
    accessorGroup = m.group(ACCESSOR_GROUP)
    dateGroup = m.group(DATE_GROUP)
    rest = rawLine[m.end():]

    #-- 2 --
    # [ if rest starts with a double-quoted string, possibly
    #   including escaped double-quote characters ->
    #     commandGroup  :=  contents of that string (with escaped
    #                       quotes unescaped)
    #     tailGroup  :=  rest, past that string
    #   else -> raise ValueError ]
    commandGroup, tailGroup = scanQuoted(rest)

    #-- 3 --
    return (accessorGroup, dateGroup, commandGroup, tailGroup)


# - - -   s c a n Q u o t e d   - - -

def scanQuoted(s):
    """Remove a double-quoted string from the front of s, with escaping

      [ s is a string ->
          if s starts with a double-quoted string in which a
          double-quote may be escaped by a preceding backslash ->
            return (contents of that string with escaped quotes
            unescaped, remainder of s after the closing quote)
          else -> raise ValueError ]
    """
    #-- 1 --
    # [ if s starts with '"' ->
    #     pos  :=  1
    #   else -> raise ValueError ]
    # startswith() rather than s[0]: an empty s must be reported as
    # ValueError, not IndexError, so callers' handlers see it.
    if not s.startswith('"'):
        raise ValueError("Expecting '\"' at the start of the "
                         "command group.")
    pos = 1
    L = []

    #-- 2 --
    # [ pos  :=  pos advanced to the closing quote or end of s,
    #            whichever comes first
    #   L  +:=  characters between s[pos:] and closing quote or
    #           end of s, with backslash-quote pairs reduced to
    #           the quote alone ]
    while (pos < len(s)) and (s[pos] != '"'):
        if s[pos:pos+2] == r'\"':
            L.append(s[pos+1])
            pos += 2
        else:
            L.append(s[pos])
            pos += 1

    #-- 3 --
    # [ if pos < len(s) ->
    #     return (elements of L concatenated, s[pos+1:])
    #   else ->
    #     raise ValueError ]
    if pos < len(s):
        return ("".join(L), s[pos+1:])
    raise ValueError("No closing double-quote: '%s'" % s)


# - - -   s c a n A c c e s s G r o u p   - - -

def scanAccessGroup(accessGroup):
    """Determine the set of effective accessor IP addresses from accessGroup

      [ accessGroup is a string ->
          if accessGroup is a valid host-group ->
            return (effective host list from accessGroup,
                    username from accessGroup or "-" if none)
          else ->
            raise ValueError ]
    """
    #-- 1 --
    # [ fieldList  :=  fields of accessGroup separated by single
    #       spaces, omitting trailing whitespace ]
    fieldList = accessGroup.rstrip().split(' ')

    #-- 2 --
    # [ if fieldList consists of two or more fields of which the
    #   next-to-last is "-" ->
    #     priHost  :=  first field of fieldList
    #     secHostList  :=  fields of fieldList from second on,
    #                      omitting last two
    #     username  :=  last field of fieldList
    #   else -> raise ValueError ]
    if (len(fieldList) >= 2) and (fieldList[-2] == "-"):
        priHost = fieldList[0]
        secHostList = fieldList[1:-2]
        username = fieldList[-1]
    else:
        raise ValueError("Badly formed accessor group: '%s'" %
                         accessGroup)

    #-- 3 --
    # [ if secHostList is empty or starts with "-" or "" ->
    #     hostList  :=  [ priHost ]
    #   else ->
    #     hostList  :=  secHostList with any trailing commas
    #                   removed from its elements ]
    hostList = findHostList(priHost, secHostList)

    #-- 4 --
    if len(hostList) == 0:
        error("No hosts found: '%s'" % accessGroup)
    return (hostList, username)


# - - -   f i n d H o s t L i s t   - - -

def findHostList(priHost, secHostList):
    """Derive the effective host list.

      [ (priHost is the primary host as a string) and
        (secHostList is a list of secondary host names, each
        possibly with a trailing comma) ->
          if secHostList is empty, or its first element is ""
          or "-" ->
            return [ priHost ]
          else ->
            return the elements of secHostList with one trailing
            comma, if any, removed from each ]
    """
    #-- 1 --
    if len(secHostList) == 0:
        return [priHost]

    #-- 2 --
    firstSec = secHostList[0]
    if (len(firstSec) == 0) or (firstSec == "-"):
        return [priHost]

    #-- 3 --
    # [ return a list of the elements of secHostList with any
    #   trailing comma removed ]
    hostList = []
    for secHost in secHostList:
        if secHost.endswith(","):
            hostList.append(secHost[:-1])
        else:
            hostList.append(secHost)
    return hostList


# - - -   s c a n D a t e G r o u p   - - -

DOM_CODE = "D"          # Day of month field
MON_CODE = "M"          # Month field (e.g., "Jan")
YYYY_CODE = "Y"         # Year field
HOUR_CODE = "h"         # Hour field
MIN_CODE = "m"          # Minutes field
SEC_CODE = "s"          # Seconds field
TZSIGN_CODE = "c"       # Sign of zone correction
TZHH_CODE = "Z"         # Hours part of zone correction
TZMM_CODE = "z"         # Minutes part of zone correction
datePat = re.compile(
    r'(?P<%s>'          # Start DOM_CODE group
    r'\d{2}'            # Day of month
    r')'
    r'/'                # Slash separator
    r'(?P<%s>'          # Start MON_CODE group
    r'[a-zA-Z]{3}'      # Three-letter month code
    r')'
    r'/'
    r'(?P<%s>'          # Start YYYY_CODE group
    r'\d{4}'            # Four-digit year
    r')'
    r':'                # Colon separator
    r'(?P<%s>'          # Start HOUR_CODE group
    r'\d{2}'            # Hour
    r')'
    r':'                # Colon separator
    r'(?P<%s>'          # Start MIN_CODE group
    r'\d{2}'            # Minute
    r')'
    r':'                # Colon separator
    r'(?P<%s>'          # Start SEC_CODE group
    r'\d{2}'            # Second
    r')'
    r' '                # Matches one space
    r'(?P<%s>'          # Start TZSIGN_CODE group
    r'[\-+]'            # Matches '+' or '-'
    r')'
    r'(?P<%s>'          # Start TZHH_CODE group
    r'\d{2}'            # Two digits
    r')'
    r'(?P<%s>'          # Start TZMM_CODE group
    r'\d{2}'            # Two digits
    r')' %
    (DOM_CODE, MON_CODE, YYYY_CODE, HOUR_CODE, MIN_CODE,
     SEC_CODE, TZSIGN_CODE, TZHH_CODE, TZMM_CODE))

monthDict = {"jan": 1, "feb": 2, "mar": 3, "apr": 4,
             "may": 5, "jun": 6, "jul": 7, "aug": 8,
             "sep": 9, "oct": 10, "nov": 11, "dec": 12}


def scanDateGroup(dateGroup):
    """Extract the access timestamp from the raw dateGroup

      [ dateGroup is a string ->
          if dateGroup is a valid date/time ->
            return that date as a timezone-aware
            datetime.datetime whose tzinfo is a FixedTimeZone
          else -> raise ValueError ]
    """
    #-- 1 --
    # [ if dateGroup matches datePat ->
    #     m  :=  a Match object for that match
    #   else ->
    #     raise ValueError ]
    m = datePat.match(dateGroup)
    if m is None:
        raise ValueError("Bad date format: '%s'" % dateGroup)

    #-- 2 --
    yyyy = int(m.group(YYYY_CODE))
    mon = m.group(MON_CODE)
    dom = int(m.group(DOM_CODE))
    hh = int(m.group(HOUR_CODE))
    mm = int(m.group(MIN_CODE))
    ss = int(m.group(SEC_CODE))
    rawSign = m.group(TZSIGN_CODE)
    tzhh = int(m.group(TZHH_CODE))
    tzmm = int(m.group(TZMM_CODE))

    #-- 3 --
    # [ if (mon is a valid three-character month code) ->
    #     monthNo  :=  the corresponding month number
    #   else -> raise ValueError ]
    try:
        monthNo = monthDict[mon.lower()]
    except KeyError:
        raise ValueError("Unknown month code '%s'" % mon.lower())

    #-- 4 --
    # [ if rawSign is "-" ->
    #     zone  :=  a FixedTimeZone instance representing
    #               (tzhh) hours and (tzmm) minutes west of UTC
    #   else ->
    #     zone  :=  a FixedTimeZone instance representing
    #               (tzhh) hours and (tzmm) minutes east of UTC ]
    zoneName = "%s%02d%02d" % (rawSign, tzhh, tzmm)
    if rawSign == "-":
        mmEast = -(tzhh * 60 + tzmm)
    else:
        mmEast = tzhh * 60 + tzmm
    zone = FixedTimeZone(mmEast, zoneName)

    #-- 5 --
    # The datetime constructor itself raises ValueError for
    # out-of-range fields (e.g. day 31 in a 30-day month).
    return datetime.datetime(yyyy, monthNo, dom, hh, mm, ss, 0, zone)


# - - -   s c a n C m d G r o u p   - - -

def scanCmdGroup(cmdGroup):
    """Extract the command and URL from the command group

      [ if cmdGroup is a string ->
          if cmdGroup is a valid command group ->
            return (command from cmdGroup, URL from cmdGroup,
                    cleaned by cleanURL())
          else -> raise ValueError ]
    """
    #-- 1 --
    # [ wordList  :=  cmdGroup broken up on space characters ]
    wordList = cmdGroup.split(' ')

    #-- 2 --
    # [ if wordList[-1] starts with "HTTP" ->
    #     wordList  :=  wordList without its last element
    #   else -> I ]
    if wordList[-1].startswith("HTTP"):
        wordList.pop()

    #-- 3 --
    # [ if url is empty ->
    #     raise ValueError
    #   else ->
    #     decodedURL  :=  url, with URL-encoding decoded, minus any
    #                     "?..." tail ]
    url = " ".join(wordList[1:])
    if len(url) == 0:
        raise ValueError("The URL is missing: '%s'" % cmdGroup)
    decodedURL = cleanURL(url)

    #-- 4 --
    # [ if decodedURL contains any null characters ->
    #     raise ValueError
    #   else ->
    #     return (first element of wordList, decodedURL) ]
    if '\x00' in decodedURL:
        raise ValueError("Nulls disallowed: %r" % url)
    return (wordList[0], decodedURL)


# - - -   c l e a n U R L   - - -

def cleanURL(rawURL):
    """Remove from an URL any URL-encoding and "?..." tail

      [ if rawURL is a string ->
          return rawURL with URL encoding decoded, minus any
          CGI arguments, with redundant "/" and ".." groups
          removed and characters >= 0x80 re-escaped as %XX ]
    """
    #-- 1 --
    # [ if rawURL contains any "?" characters ->
    #     head  :=  rawURL up to the first "?" character
    #   else ->
    #     head  :=  rawURL ]
    head = rawURL.split("?")[0]

    #-- 2 --
    # [ unquoted  :=  head with URL-encoded characters unquoted ]
    # On Python 3, decode as latin-1 so every escaped byte becomes
    # one character with the same ordinal, matching the byte-wise
    # result of Python 2's urllib.unquote.
    if _py3Unquote is None:
        unquoted = urllib.unquote(head)
    else:
        unquoted = _py3Unquote(head, encoding="latin-1")

    #-- 3 --
    # [ return unquoted with redundant "/" and ".." groups
    #   removed and all characters with codes >= 0x80
    #   replaced by their URL-encoded forms ]
    # posixpath, not os.path: URL paths always use "/" no matter
    # which platform this runs on.
    return posixpath.normpath(asciifyString(unquoted))


# - - -   a s c i i f y S t r i n g   - - -

def asciifyString(s):
    """Like urllib.quote(), but only quotes non-ASCII characters.

      [ s is a string ->
          return s with all characters >= 0x80 escaped using
          URL encoding ]
    """
    return "".join([asciifyChar(c) for c in s])


# - - -   a s c i i f y C h a r   - - -

def asciifyChar(c):
    """Escape c if it isn't ASCII, otherwise return c.

      [ c is a one-character string ->
          if ord(c) < 0x80 -> return c
          else -> return '%XX' where XX is ord(c) in uppercase
                  hexadecimal ]
    """
    if ord(c) < 0x80:
        return c
    return "%%%02X" % ord(c)


# - - -   s c a n T a i l G r o u p   - - -

STATUS_FIELD = "c"      # Result code, e.g., 200 ok, 404 not found
tailPat = re.compile(
    r' '                # Space before status code
    r'(?P<%s>'          # Start STATUS_FIELD group
    r'\d+'              # Result code: all leading digits
    r')'
    r' '                # One space
    % STATUS_FIELD)


def scanTailGroup(tailGroup):
    """Extract the status from the tail group.

      [ if tailGroup is a valid tail group ->
          return status from tailGroup as an integer
        else -> raise ValueError ]
    """
    #-- 1 --
    # [ if tailGroup matches tailPat ->
    #     m  :=  a Match object describing that match
    #   else ->
    #     raise ValueError ]
    m = tailPat.match(tailGroup)
    if m is None:
        raise ValueError("Invalid status/length/referrer group '%s'" %
                         tailGroup)

    #-- 2 --
    # [ return STATUS_FIELD from m as an integer ]
    return int(m.group(STATUS_FIELD))


# - - - - -   c l a s s   P a g e G e t   - - - - -

class PageGet:
    """Class to represent one line from the Apache access_log

      Exports:
        PageGet ( accessor, username, when, command, url, status ):
          [ if (accessor is the remote site's IP address as a string)
            and (username is the username if the page is under
            password protection, "-" otherwise)
            and (when is the time of the access as a
            datetime.datetime)
            and (command is the command, e.g., "GET" or "POST")
            and (url is the URL the accessor wants to load)
            and (status is the usual HTTP return status as an
            integer) ->
              return a new PageGet object representing that access ]
        .accessor:    [ as passed to constructor ]
        .username:    [ as passed to constructor ]
        .when:        [ as passed to constructor ]
        .command:     [ as passed to constructor ]
        .url:         [ as passed to constructor ]
        .status:      [ as passed to constructor ]
        .isFar ( symDomain, ipDomain ):
          [ if (symDomain is a list of strings representing the
            symbolic form of the local domain such as ["nmt","edu"])
            and (ipDomain is a list of strings representing the
            numeric form of the local domain such as ["129","138"]) ->
              if self.accessor has fewer dot-separated parts than
              symDomain or ipDomain ->
                return True if its first part is "unknown",
                else False
              else if (its first parts match ipDomain) or
              (its last parts match symDomain) ->
                return False
              else -> return True ]
        str(self):  [ return a string representing self for debug use ]
    """

    # - - -   P a g e G e t . _ _ i n i t _ _   - - -

    def __init__(self, accessor, username, when, command, url,
                 status):
        """Constructor for PageGet"""
        self.accessor = accessor
        self.username = username
        self.when = when
        self.command = command
        self.url = url
        self.status = status

    # - - -   P a g e G e t . i s F a r   - - -

    def isFar(self, symDomain, ipDomain):
        """Is this record from outside our domain?

          [ (symDomain is a list of the trailing parts of the
            local IP domain as strings) and
            (ipDomain is a list of the leading parts of the local
            IP domain as numbers in string form) ->
              if self represents a fetch from symDomain or
              ipDomain ->
                return False
              else -> return True ]
        """
        #-- 1 --
        # [ L  :=  list of parts of self.accessor split on "." ]
        L = self.accessor.split(".")

        #-- 2 --
        # [ if L is at least as long as both symDomain and
        #   ipDomain ->
        #     I
        #   else if L[0] is "unknown" ->
        #     return True
        #   else -> return False ]
        # A name too short to carry the domain is treated as local,
        # except the literal "unknown", which is treated as far.
        if (len(L) < len(symDomain)) or (len(L) < len(ipDomain)):
            return L[0] == "unknown"

        #-- 3 --
        # [ if ipDomain matches the initial elements of L ->
        #     return False
        #   else -> I ]
        if ipDomain == L[:len(ipDomain)]:
            return False

        #-- 4 --
        # [ if symDomain matches the final elements of L ->
        #     return False
        #   else ->
        #     return True ]
        return symDomain != L[-len(symDomain):]

    # - - -   P a g e G e t . _ _ s t r _ _   - - -

    TIME_FORMAT = "%Y-%m-%dT%H:%M:%S %z"

    def __str__(self):
        "Output self as a string"
        humanDate = self.when.strftime(self.TIME_FORMAT)
        if self.username != "-":
            isPassword = " [PWD]"
        else:
            isPassword = ""
        return ("%s %s%s %s %s %s" %
                (humanDate, self.accessor, isPassword, self.command,
                 self.status, self.url))


# - - -   e r r o r   - - -

ERROR_LOG = "/u/www/docs/tcc/webstats/error-log"


def error(*L):
    """Send a message to the standard error stream.

      [ L is a list of strings ->
          sys.stderr  +:=  "*** " + (elements of L, concatenated)
                           + a newline
          file ERROR_LOG  +:=  the same text ]
    """
    text = "*** %s\n" % "".join(L)
    message(text)


# - - -   m e s s a g e   - - -

def message(text):
    """Send a message to stderr and ERROR_LOG.

      [ text is a string ->
          sys.stderr  +:=  text
          file ERROR_LOG  +:=  text ]
    """
    sys.stderr.write(text)
    # "with" closes the log even if the write fails.
    with open(ERROR_LOG, "a") as logFile:
        logFile.write(text)