Next / Previous / TCC home / NMT home

Source listing for webstats.py

logo
#!/usr/local/bin/python
#--
# webstats:  Builds a web-based report showing statistics on recent access to
#   individual web pages.
#       $Revision: 1.12 $ $Date: 2004/01/28 01:56:06 $
# For documentation, see /u/www/docs/tcc/projects/weblog/weblog.{sgml,ps}
#
# Database formats---both are textual data files:
# (1) cur-db: lines of the form  yyyymmdd|nNear|nFar|url
# (2) sum-db: lines of the form  nTotal|nFar|url
#--

#--
# Because the pageget.py module uses generators (a new feature in
# Python 2.2), the following import must precede all others:
#--

from __future__ import generators

import sys, os, string, time    # Standard Python modules
from pageget import *           # Routines to handle access logs

# Name and version strings identifying this program.
PRODUCT_NAME      =  "webstats"
EXTERNAL_VERSION  =  "1.1"

#================================================================
# CONTENTS
#================================================================
# updateDatabase():  Merge new log with database, discard expired records
#   expireDatabase():  Discard expired records from database
#     expireFile():  Work through the database, expiring records
#       expireLine():  Check one cur-db record for expiration
#   addNew():  Add valid, unexpired log records from <stdin> to database
#     translateLog():  Summarize raw log by URL and date
#       readNewLog():  Read raw log entries from <stdin>
#         isGoodPageGet():  Filter out uninteresting accesses
#         addPageGet():  Add one access to the database
# sumDatabase():  Build db of hits summarized over all dates
#   sumFlush():  Flush one URL's worth of summary
# writeWeb():  Generate reports from the database as web pages
#   buildHomepage():  Build root page of report web
#     showSummary():  Show time range and total hits
#   buildURLPage():  Build pages showing hits sorted by URL
#     buildSegments():  Build pages by first segment of URL
#       checkSegChange():  Start new segment page if necessary
#         segFileStart():  Start writing a new segment page
#         segFileFinish():  Finish writing the last segment page
#       showUsage():  Write one detail line of a usage report
#   buildHitsPage():  Build pages showing URLs sorted by total hits
#     buildHitsLines():  Transform hits-db into page contents
#================================================================

#================================================================
# BLANKET PRECONDITIONS:  Any intended function may assume that:
# - This script has read and write access to files old-db, cur-db,
#   new-db, add-db, and sum-db
# - Subdirectory BASE_DIR+SEG_DIR exists
#----------------------------------------------------------------

#================================================================
# VERIFICATION FUNCTIONS
#----------------------------------------------------------------
# add-db  ==
#   the file named ADD_DB_NAME in BASE_DIR, in daily-db-format
#----------------------------------------------------------------
# cur-db  ==
#   the file named DB_NAME in BASE_DIR, in daily-db-format
#--
#   This is the database that persists between runs.
#----------------------------------------------------------------
# daily-db-format ==
#   a line of the form
#       yyyymmdd|nNear|nFar|url
#   where yyyymmdd is the date, nNear is the number of local accesses,
#   nFar is the number of off-site accesses, and url is the URL
#--
#   Summarizes hits on one page within one local calendar day.
#   Used for cur-db, new-db, old-db, add-db.
#----------------------------------------------------------------
# first-segment(url) ==
#   if url contains no slashes -> ""
#   else -> portion of URL between first slash and second slash
#   (or end of line)
#--
#   Formerly, the report by URLs was one huge page.  For November
#   1999 it was 28 megabytes!  In the interest of easing the pain
#   of people on slow links, the URLs are broken up by their
#   "first segment"---e.g., "/mainpage" for the PIO, "~username", etc.
#----------------------------------------------------------------
# good-page-get(pageGet)  ==
#   if (pageGet.username == "-") and
#   (pageGet.status < BAD_STATUS_THRESHOLD) and
#   (pageGet.accessor is not a key in SPIDERS) and
#   (pageGet.url does not end in an extension that is a key in
#   IMAGE_EXTENSIONS) ->
#     1
#   else -> 0
#--
#   In order to be counted by this program, an access
#   (represented by a PageGet object) must meet these conditions:
#     - It can't require a password.  The .username field is
#       usually "-", but contains the username if Apache has
#       granted them password success.
#     - Its status code must indicate a successful fetch.
#     - It can't be an image file.
#----------------------------------------------------------------
# hits-db  ==
#   the file named HITS_DB_NAME, in sum-db-format
#--
#   sum-db, sorted by (total hits, URL)
#----------------------------------------------------------------
# old-db  ==
#   the file named OLD_DB_NAME in BASE_DIR, in daily-db-format
#--
#   This is a copy of the previous cur-db after updating.
#----------------------------------------------------------------
# new-db  ==
#   the file named NEW_DB_NAME in BASE_DIR, in daily-db-format
#--
#   This is the name used for a new database before renaming.
#----------------------------------------------------------------
# report-by-hits ( db, now, cutoffTime, nNear, nFar )  ==
#   Web page summarizing the time interval [cutoffTime,
#   now] and totals of nNear local hits and nFar offsite hits,
#   and listing all pages with more than MIN_HITS hits in
#   descending order by (total number of hits, URL)
#----------------------------------------------------------------
# report-by-hits-file  ==
#   the file named HITS_PAGE in BASE_DIR
#----------------------------------------------------------------
# report-homepage ( db, now, cutoffTime, nNear, nFar ) ==
#   Web page summarizing the time interval [cutoffTime,
#   now] and totals of nNear local hits and nFar offsite hits
#   and linking to report-by-hits and reports-by-url
#----------------------------------------------------------------
# report-homepage-file  ==
#   the file named HOME_PAGE in BASE_DIR
#----------------------------------------------------------------
# report-line ( nHits, nFar, url )  ==
#   line displaying nHits total hits, nFar as a percentage of
#   nHits, and url
#----------------------------------------------------------------
# report-web ( db, now, cutoffTime, nNear, nFar ) ==
#   report-homepage + report-by-hits + reports-by-url
#--
#   Web pages describing access records by URL and by number of
#   hits from db, and summarizing the time interval [cutoffTime,
#   now] and totals of nNear local hits and nFar offsite hits
#----------------------------------------------------------------
# report-web-files ( db, now, cutoffTime, nNear, nFar )  ==
#   report-homepage-file + report-by-hits-file + report-by-url-files
#----------------------------------------------------------------
# reports-by-url ( db, now, cutoffTime, nNear, nFar )  ==
#   Web page summarizing the time interval [cutoffTime,
#   now] and totals of nNear local hits and nFar offsite hits,
#   and showing a line for each different first-segments of URLs
#   in db (in sum-db-format), with each line a link to a page
#   showing the hits on URLs starting with that segment,
#   alphabetized by URL
#----------------------------------------------------------------
# report-by-url-files  ==
#   the file named URL_PAGE in BASE_DIR, plus segment-page-files
#----------------------------------------------------------------
# segment-pages(sum-db)  ==
#   a set of pages, one per first-segment(url) for all URLs in
#   sum-db, each page containing all URLs starting with that
#   first-segment, alphabetized by URL
#----------------------------------------------------------------
# segment-page-files  ==
#   the pages named s00000.html, s00001.html, ..., in subdirectory
#   SEG_DIR of BASE_DIR, containing the pages for each segment
#----------------------------------------------------------------
# sum-db  ==
#   the file named SUM_DB_NAME in BASE_DIR, in sum-db-format
#--
#   Temporary file summing the accesses for a page over the
#   dates in cur-db.
#----------------------------------------------------------------
# sum-db-format ==
#   a line of the form
#       nTotal|nFar|url
#   where nTotal=(nNear+nFar) and other fields are as in the
#   daily-db-format
#--
#   Summarizes all hits on one page since the cutoff time.
#   Used for sum-db.
#----------------------------------------------------------------

#================================================================
# Manifest constants
#----------------------------------------------------------------

#--
# Names of the `databases,' actually flat files
#--

BASE_DIR      =  "/u/www/docs/tcc/webstats/"    # Where pages and db's live
SEG_DIR       =  "seg/"             # Subdirectory where segment pages live
DB_NAME       =  "weblog_db"        # cur-db
NEW_DB_NAME   =  "new_db"           # new-db
OLD_DB_NAME   =  "old_db"           # old-db
ADD_DB_NAME   =  "add_db"           # add-db
SUM_DB_NAME   =  "sum_db"           # sum-db
HITS_DB_NAME  =  "hits_db"          # hits-db
FIELD_SEP     =  "|"                # Field separator in database files
HOME_PAGE     =  "homepage.html"    # Name of start page generated
HITS_PAGE     =  "byhits.html"      # Name of hit-parade page
URL_PAGE      =  "byurl.html"       # Name of by-URL page
HTML_SUFFIX   =  ".html"            # Suffix for Web pages
SEG_NAME_FMT  =  "s%05d"            # For creating segment page names
#   NB: The above format allows for 99,999 different first-segment
#   strings.  As of January 2000, there were under 200 in the access log.

#--
# Times and intervals
#--

EXPIRE_DAYS   =  31             # Number of days in the past we keep
EXPIRE_SEC    =  EXPIRE_DAYS * 24 * 60 * 60 # Same in seconds
TIME_FORMAT   =  "%Y-%m-%d"     # strftime() => "YYYY-MM-DD"

#--
# IMAGE_EXTENSIONS is a dictionary whose keys are the (uppercased)
#   extensions of files that are considered to be image files and
#   hence ignored in our statistical summaries.  CSS files are
#   also ignored.  Only the keys matter; the values are unused
#   placeholders.
# SPIDERS is a dictionary whose keys identify search engine spiders
#   to be ignored.  Each key is compared against the .accessor
#   attribute of a PageGet object, so it must be in whatever form
#   that attribute uses (the one entry below is a host name, not a
#   numeric IP address).  Only the keys matter here too.
#--
IMAGE_EXTENSIONS  =  {
    ".JPG": 0,  ".JPEG": 0,  ".GIF": 0,  ".PNG": 0, ".CSS": 0}
SPIDERS  =  { "infohost": 0 }

#--
# Miscellaneous constants
#--

SYM_DOMAIN    =  ["nmt", "edu"]     # Local domain, symbolic form
IP_DOMAIN     =  ["129", "138"]     # Local domain, numeric form
MIN_HITS      =  10             # Lower limit for hits on by-hits page
BAD_STATUS_THRESHOLD  =  300    # Status codes >= this indicate failure



# - - -   u p d a t e D a t a b a s e   - - -

def updateDatabase ( cutoffTime ):
    """Expire stale records from cur-db, then merge in the log from <stdin>.

      [ if cutoffTime is an epoch time ->
          cur-db  :=  (cur-db if it exists, else nothing) + valid access
                      log records from <stdin>, minus any records before
                      cutoffTime, in daily-db-format
          return (total local accesses, total offsite accesses) in
          that database ]
    """

    #-- 1 --
    # [ cur-db  :=  cur-db minus records before cutoffTime (an empty
    #               file if cur-db did not exist)
    #   kept    :=  (local accesses, offsite accesses) totalled over
    #               the records that remain in cur-db ]
    kept  =  expireDatabase ( cutoffTime )

    #-- 2 --
    # [ cur-db  :=  cur-db + valid access log records from <stdin>,
    #               minus any before cutoffTime, in daily-db-format
    #   added   :=  (local accesses, offsite accesses) totalled over
    #               those newly added records ]
    added  =  addNew ( cutoffTime )

    #-- 3 --
    # Combine the two (near, far) pairs element by element.
    return ( kept[0] + added[0], kept[1] + added[1] )


# - - -   e x p i r e D a t a b a s e   - - -

def expireDatabase ( cutoffTime ):
    """Remove expired records from current database, if any

      [ if (cutoffTime is an epoch time) ->
          if cur-db does not exist ->
            cur-db  :=  an empty file
            return (0, 0)
          else ->
            cur-db  :=  cur-db minus records before cutoffTime ]
            return (number of local accesses in cur-db minus records
            before cutoffTime, number of offsite accesses in cur-db
            minus records before cutoffTime ] ]
    """
    #-- 1 --
    # [ if cur-db does not exist and can be created ->
    #     cur-db  :=  an empty file
    #     return (0, 0)
    #   else if cur-db can be opened for reading ->
    #     dbFile  :=  a readable file handle for cur-db ]
    dbName  =  BASE_DIR + DB_NAME
    try:
        dbFile  =  open ( dbName )
    except IOError:
        result = os.system("touch %s" % dbName )
        if result:
            sys.stderr.write("Can't create an empty webstat database.")
            sys.exit(1)
        return (0, 0)

    #-- 2 --
    # [ new-db  :=  records from dbFile, minus any before cutoffTime,
    #               in daily-db-format
    #   sums    :=  (# of local accesses, # of offsite accesses) in
    #               records from dbFile, minus any before cutoffTime ]
    sums  =  expireFile ( dbFile, cutoffTime )

    #-- 3 --
    # [ if (old-db, cur-db, and new-db are all writeable) ->
    #     old-db  :=  cur-db
    #     cur-db  :=  new-db
    #     new-db  :=  <deleted> ]
    try:
        oldName  =  BASE_DIR + OLD_DB_NAME
        if os.path.exists ( oldName ):
            os.remove ( oldName )
    except OSError, detail:
        sys.stderr.write ( "Can't delete the old database after "
                           "expiration:\n%s" % `detail` )
        sys.exit(1)

    try:
        os.rename ( BASE_DIR + DB_NAME,
                    BASE_DIR + OLD_DB_NAME )
    except OSError, detail:
        sys.stderr.write ( "Can't rename current -> old database after "
                           "expiration:\n%s" % `detail` )
        sys.exit(1)

    try:
        os.rename ( BASE_DIR + NEW_DB_NAME,
                    BASE_DIR + DB_NAME )
    except OSError, detail:
        sys.stderr.write ( "Can't rotate new -> current database after "
                           "expiration:\n%s" % `detail` )
        sys.exit(1)

    #-- 4 --
    return sums


# - - -   e x p i r e F i l e   - - -

def expireFile ( dbFile, cutoffTime ):
    """Copy the unexpired records of dbFile into new-db and total them.

      [ if (dbFile is a readable file handle)
        and (cutoffTime is an epoch time in seconds) ->
          new-db  :=  records from dbFile, minus any before cutoffTime,
                      in daily-db-format
          dbFile  :=  dbFile, read to EOF and closed
          return (# of local accesses, # of offsite accesses) in the
          records copied to new-db ]

      If new-db cannot be created, a message is written to stderr and
      the run ends through sys.exit(1).
    """
    #-- 1 --
    # [ newFile  :=  a writeable file handle for a new, empty new-db ]
    try:
        newFile  =  open ( BASE_DIR + NEW_DB_NAME, "w" )
    except IOError:
        sys.stderr.write ( "Can't create the new database during "
                           "the expiration phase." )
        sys.exit(1)

    #-- 2 --
    # [ newFile    +:=  records from dbFile not before cutoffTime
    #   totalNear  :=   local accesses summed over those records
    #   totalFar   :=   offsite accesses summed over those records ]
    totalNear  =  0
    totalFar   =  0
    while 1:
        record  =  dbFile.readline()
        if  not record:
            break               # readline() returns "" only at EOF

        # expireLine() copies the record if it is recent enough and
        # reports its counts; an expired record contributes (0, 0).
        near, far  =  expireLine ( cutoffTime, record, newFile )
        totalNear  +=  near
        totalFar   +=  far

    #-- 3 --
    newFile.close()
    dbFile.close()
    return (totalNear, totalFar)


# - - -   e x p i r e L i n e   - - -

def expireLine ( cutoffTime, line, newFile ):
    """If the line is not expired, write it to newFile.

      [ if (cutoffTime is an epoch time)
        and (line is a daily-db-format line as a string)
        and (newFile is a writeable file handle) ->
          if line has a time >= cutoffTime ->
            newFile  +:=  line
            return (# of local accesses, # of offsite accesses) from line
          else ->
            return (0, 0) ]

      The time of a line is local midnight at the start of its date.
    """

    #-- 1 --
    # [ yyyymmdd  :=  the date from line
    #   nNear     :=  the local access count from line
    #   nFar      :=  the offsite access count from line ]
    # Only the first three fields are needed; the URL is never parsed
    # here because the line is copied verbatim.
    fields    =  line.split ( FIELD_SEP )
    yyyymmdd  =  fields[0]
    nNear     =  int ( fields[1] )
    nFar      =  int ( fields[2] )

    #-- 2 --
    # [ lineTime  :=  local midnight of date yyyymmdd as an epoch time ]
    # The last element (isdst) is -1 so that mktime() decides for
    # itself whether daylight time applies on that date.  Forcing it
    # to 0 made summer dates come out one hour after midnight.
    yyyy  =  int ( yyyymmdd[0:4] )
    mm    =  int ( yyyymmdd[4:6] )
    dd    =  int ( yyyymmdd[6:8] )
    timeTuple  =  ( yyyy, mm, dd,  0, 0, 0,  0, 0, -1 )
    lineTime   =  time.mktime ( timeTuple )

    #-- 3 --
    # [ if lineTime >= cutoffTime ->
    #     newFile  +:=  line
    #   else -> I ]
    if  lineTime >= cutoffTime:
        newFile.write ( line )
        return (nNear, nFar)
    else:
        return (0, 0)


# - - -   a d d N e w   - - -

def addNew ( cutoffTime ):
    """Add valid, unexpired access log records from <stdin> to database.

      [ if cutoffTime is an epoch time ->
          cur-db  :=  cur-db + (valid access log records from <stdin>,
                      minus any before cutoffTime, in daily-db-format,
                      sorted by (URL, date))
          return (# of local accesses, # of offsite accesses) from
          valid access log records from <stdin>, minus any before cutoffTime ]
    """

    #-- 1 --
    # [ add-db  :=  valid access log records from <stdin>, minus any
    #               before cutoffTime, summarized in daily-db-format
    #   sums    :=  (# of local accesses, # of offsite accesses) from
    #               valid access log records from <stdin>, minus any
    #               before cutoffTime ]
    sums  =  translateLog ( cutoffTime )

    #-- 2 --
    # [ new-db  :=  (cur-db + add-db), sorted by (URL, date) ]
    curName  =  BASE_DIR + DB_NAME
    newName  =  BASE_DIR + NEW_DB_NAME
    addName  =  BASE_DIR + ADD_DB_NAME
    oldName  =  BASE_DIR + OLD_DB_NAME
    command  =  ( "sort "           # Use Unix sort
                  "-T /fs/tmp "     # Use honking big temp partition
####                  "-y10000 "        # Use 10M memory (not Linux)
                  "-S 10M "         # Use 10 megabytes of memory
                  "-t'%s' "         # Use FIELD_SEP as the field separator
                  "+3 "             # Primary key is the URL (4th field)
                  "+0 "             # Secondary key is the date
                  "%s %s "          # Names of cur-db and add-db
                  ">%s" %           # Name of new-db
                  ( FIELD_SEP, curName, addName, newName ) )
    try:
        if  os.path.exists ( newName ):
            os.remove ( newName )
    except OSError, detail:
        sys.stderr.write ( "Can't remove the new database `%s' "
                           "before the merge sort.\n%s\n" %
                           ( newName, detail ) )
        sys.exit(1)
    os.system ( command )

    #-- 3 --
    # [ old-db  :=  cur-db
    #   cur-db  :=  new-db
    #   new-db  :=  <deleted> ]
    try:
        if  os.path.exists ( oldName ):
            os.remove ( oldName )
    except OSError, detail:
        sys.stderr.write ( "Can't delete the old database file "
                           "after the merge sort.\n%s\n" % detail )
        sys.exit(1)

    try:
        os.rename ( curName, oldName )
    except OSError, detail:
        sys.stderr.write ( "Can't rename current -> old database file "
                           "after the merge sort.\n%s\n" % detail )
        sys.exit(1)

    try:
        os.rename ( newName, curName )
    except OSError, detail:
        sys.stderr.write ( "Can't rename new -> current database file "
                           "after the merge sort.\n%s\n" % detail )
        sys.exit(1)

    #-- 4 --
    return sums


# - - -   t r a n s l a t e L o g   - - -

def translateLog ( cutoffTime ):
    """Summarize the raw access log from <stdin> into add-db.

      [ if cutoffTime is an epoch time ->
          add-db  :=  valid access log records from <stdin>, minus any
                      before cutoffTime, summarized in daily-db-format
          return (# local accesses, # offsite accesses) from valid
          access log records from <stdin>, minus any before cutoffTime ]

      If add-db cannot be created, a message is written to stderr and
      the run ends through sys.exit(1).
    """

    #-- 1 --
    # [ addFile  :=  a writeable file handle to a new, empty add-db ]
    addName  =  BASE_DIR + ADD_DB_NAME
    try:
        addFile  =  open ( addName, "w" )
    except IOError:
        sys.stderr.write ( "Can't create the add-db, `%s'.\n" % addName )
        sys.exit(1)

    #-- 2 --
    # [ <stdin>  :=  <stdin> advanced to EOF
    #   urlMap   :=  a dictionary mapping (url, yyyymmdd) |-> [nNear, nFar]
    #                summarizing valid access log records from <stdin>
    #                minus those before cutoffTime ]
    # The values are plain two-element lists rather than objects
    # because memory may be limited.
    urlMap  =  {}
    readNewLog ( cutoffTime, urlMap )

    #-- 3 --
    # [ addFile    +:=  one daily-db-format record per entry of urlMap
    #   totalNear  :=   sum of the local counts in urlMap
    #   totalFar   :=   sum of the offsite counts in urlMap ]
    # The records are written in dictionary order.
    totalNear  =  0
    totalFar   =  0
    for  (url, yyyymmdd), (nNear, nFar) in urlMap.items():
        record  =  "%s%s%d%s%d%s%s\n" % ( yyyymmdd, FIELD_SEP, nNear,
                                        FIELD_SEP, nFar, FIELD_SEP, url )
        addFile.write ( record )
        totalNear  +=  nNear
        totalFar   +=  nFar

    #-- 4 --
    addFile.close()
    return (totalNear, totalFar)


# - - -   r e a d N e w L o g   - - -

def readNewLog ( cutoffTime, urlMap ):
    """Fold the acceptable access log records from <stdin> into urlMap.

      [ if (cutoffTime is an epoch time)
        and (urlMap is a dictionary mapping (url, yyyymmdd) |->
        [nNear, nFar]) ->
          <stdin>  :=  <stdin> as consumed by scanAccessLog()
          urlMap   :=  urlMap with access data added from valid log
                       records from <stdin> minus those before
                       cutoffTime ]
    """

    #-- 1 --
    # scanAccessLog() comes from the pageget module; each item it
    # yields is handled as a PageGet object.
    for  hit in scanAccessLog ( sys.stdin ):
        #-- 1 body --
        # [ if hit passes isGoodPageGet() ->
        #     urlMap  :=  urlMap with hit counted
        #   else -> I ]
        if  not isGoodPageGet ( hit, cutoffTime ):
            continue
        addPageGet ( urlMap, hit )


# - - -   i s G o o d P a g e G e t   - - -

def isGoodPageGet ( pageGet, cutoffTime ):
    """Decide whether one access should be counted.

      [ if (pageGet is a PageGet object)
        and (cutoffTime is an epoch time) ->
          if good-page-get ( pageGet )
          and pageGet.when >= cutoffTime ->
            return 1
          else -> return 0 ]
    """

    #-- 1 --
    # An access is rejected as soon as any one of these tests holds.
    # They are tried in this order, and later ones are not evaluated
    # once one has matched:
    #   - the status code signals a failed fetch (e.g., 403, 404)
    #   - the URL's extension, upshifted, is a key in IMAGE_EXTENSIONS
    #   - the accessor is a key in SPIDERS
    #   - a username other than "-" was logged
    #   - the access happened before cutoffTime
    if  ( pageGet.status >= BAD_STATUS_THRESHOLD
          or  os.path.splitext ( pageGet.url )[1].upper() in IMAGE_EXTENSIONS
          or  pageGet.accessor in SPIDERS
          or  pageGet.username != "-"
          or  pageGet.when < cutoffTime ):
        return 0

    #-- 2 --
    return 1


# - - -   a d d P a g e G e t   - - -

def addPageGet ( urlMap, pageGet ):
    """Count one access in urlMap.

      [ if (urlMap is a dictionary mapping (url, yyyymmdd) |->
        [nNear, nFar])
        and (pageGet is a PageGet object) ->
          urlMap  :=  urlMap with the hit from pageGet added to the
                      entry for (pageGet.url, date of pageGet.when),
                      counted as local or offsite relative to
                      SYM_DOMAIN and IP_DOMAIN ]
    """

    #-- 1 --
    # [ day       :=  pageGet.when as a local date, "yyyymmdd"
    #   localHit  :=  result of pageGet.isLocal() ]
    # isLocal() is expected to return 1 for a local access and 0
    # otherwise, since the offsite count below is (1 - localHit).
    day       =  time.strftime ( "%Y%m%d",
                                 time.localtime ( pageGet.when ) )
    localHit  =  pageGet.isLocal ( SYM_DOMAIN, IP_DOMAIN )

    #-- 2 --
    # [ if urlMap has no key (pageGet.url, day) ->
    #     urlMap[(pageGet.url, day)]  :=  [0, 0]
    #   else -> I ]
    counts  =  urlMap.setdefault ( (pageGet.url, day), [0, 0] )

    #-- 3 --
    # counts is the list stored in urlMap, so updating it in place
    # updates the map.
    counts[0]  +=  localHit
    counts[1]  +=  1 - localHit


# - - -   s u m D a t a b a s e   - - -

def sumDatabase():
    """Build sum-db by summing daily records in cur-db.

      [ if (cur-db is in daily-db-format and sorted by URL) ->
          sum-db  :=  records from cur-db summed over all dates into
                      sum-db-format, in the same URL order ]

      This is a classic serial algorithm: read and sum lines from cur-db,
      and whenever the URL changes, flush the old sum (if any) and start
      a new one.  Flush the old sum (if any) at the end.  The summaries
      are kept in sumNear and sumFar, and oldURL holds the previous URL
      (initially None).

      Both files are closed on the way out, even if a malformed line
      raises an exception.
    """

    #-- 1 --
    # [ f        :=  an open, readable file handle for cur-db
    #   sumf     :=  an open, writeable file handle for a new, empty sum-db
    #   oldURL   :=  None
    #   sumNear  :=  0
    #   sumFar   :=  0 ]
    f  =  open ( BASE_DIR + DB_NAME )
    try:
        sumf  =  open ( BASE_DIR + SUM_DB_NAME, "w" )
        try:
            oldURL   =  None            # Holds previous URL
            sumNear  =  0               # Summarizes local hits
            sumFar   =  0               # Summarizes offsite hits
            line     =  f.readline()

            #-- 2 --
            # [ sumf  +:=  records in sum-db-format summarizing clusters
            #              of lines from (line+f) with the same URL ]
            while  line != "":
                #-- 2.1 --
                # [ nNear  :=  local hit count from line
                #   nFar   :=  offsite hit count from line
                #   url    :=  URL from line ]
                # Split at most three times: the URL is the last field
                # and may itself contain FIELD_SEP, in which case an
                # unlimited split would truncate it.
                L      =  line.rstrip().split ( FIELD_SEP, 3 )
                nNear  =  int ( L[1] )
                nFar   =  int ( L[2] )
                url    =  L[3]

                #-- 2.2 --
                # [ if  url != oldURL ->
                #     sumf     +:=  a line in sum-db-format for oldURL,
                #                   if there was a previous URL
                #     sumNear  :=   0
                #     sumFar   :=   0
                #     oldURL   :=   url
                #   else -> I ]
                if  url != oldURL:
                    sumFlush ( sumf, sumNear, sumFar, oldURL )
                    sumNear  =  0
                    sumFar   =  0
                    oldURL   =  url

                #-- 2.3 --
                sumNear  =  sumNear + nNear
                sumFar   =  sumFar + nFar

                #-- 2.4 --
                # [ line  :=  next line from f, or "" if at EOF ]
                line  =  f.readline()

            #-- 3 --
            # [ if there was at least one URL ->
            #     sumf  +:=  a line in sum-db-format for the last URL
            #   else -> I ]
            sumFlush ( sumf, sumNear, sumFar, oldURL )
        finally:
            sumf.close()
    finally:
        f.close()


# - - -   s u m F l u s h   - - -

def sumFlush ( sumf, sumNear, sumFar, url ):
    """Write one sum-db-format line to sumf, unless there is no URL.

      [ if (sumf is a writeable file handle)
        and (sumNear is a count of local hits)
        and (sumFar is a count of offsite hits)
        and (url is a string or None) ->
          if  url is None or empty -> I
          else ->
            sumf  +:=  a line in sum-db-format containing
                       sumNear+sumFar, sumFar, and url ]
    """
    #-- 1 --
    # Nothing has been accumulated yet when there is no URL.
    if  not url:
        return

    #-- 2 --
    # [ sumf  +:=  "nTotal|nFar|url" with FIELD_SEP as the separator ]
    fields  =  ( "%d" % ( sumNear + sumFar ),
                 "%d" % sumFar,
                 "%s" % ( url, ) )
    sumf.write ( FIELD_SEP.join ( fields ) + "\n" )


# - - -   w r i t e W e b   - - -

def writeWeb ( now, cutoffTime, nNear, nFar ):
    """Generate every page of the usage report.

      [ if cutoffTime is an epoch time ->
          report-web-files  :=  report-web ( cur-db, now, cutoffTime,
                                nNear, nFar ) ]
    """

    #-- 1 --
    # [ report-homepage-file  :=  report-homepage ( ... )
    #   report-by-url-files   :=  reports-by-url ( ... )
    #   report-by-hits-file   :=  report-by-hits ( ... ) ]
    # The three builders take the same arguments and are run in this
    # order: home page, pages by URL, page by hits.
    builders  =  ( buildHomepage, buildURLPage, buildHitsPage )
    for  build in builders:
        build ( now, cutoffTime, nNear, nFar )


# - - -   b u i l d H o m e p a g e   - - -

def buildHomepage ( now, cutoffTime, nNear, nFar ):
    """Build the start page with stats and links to the other two pages

      [ if (now and cutoffTime are epoch times)
        and (nNear and nFar are total hit counts for the site) ->
          home-page  :=  page giving overall stats and date ranges,
                         and linking to by-hits-page and by-url-page ]

      The page is written to BASE_DIR + HOME_PAGE.  The file is closed
      even when one of the writes raises an exception.
    """

    #-- 1 --
    # [ fileName  :=  name of homepage ]
    fileName  =  BASE_DIR + HOME_PAGE

    #-- 2 --
    # [ if fileName can be opened new for writing ->
    #     p  :=  a writeable file pointer to a new file named fileName ]
    p  =  open ( fileName, "w" )

    # try/finally (rather than a trailing close) so the handle is
    # released on every exit path, including exceptions.
    try:
        #-- 3 --
        # [ p  +:=  page header ]
        title  =  "TCC Web server statistics"
        p.write ( "<title>%s</title>\n" % title )
        p.write ( "<h1>%s</h1>\n" % title )

        #-- 4 --
        # [ p  +:=  summary of time ranges and total hits ]
        showSummary ( p, now, cutoffTime, nNear, nFar )

        #-- 5 --
        # [ p  +:=  links to URL_PAGE and HITS_PAGE, then a pointer to
        #           the project documentation ]
        p.write ( "\n" )
        p.write ( "<ul>\n" )
        p.write ( '  <li><a href="%s">Access report by URL</a>\n' %
                  URL_PAGE )
        p.write ( '  <li><a href="%s">Access report by number of hits</a>\n' %
                  HITS_PAGE )
        p.write ( "</ul>\n" )
        p.write ( "<hr>\n" )
        p.write ( "<p>For more information on these statistics, see the\n" )
        p.write ( '<a href="http://www.nmt.edu/tcc/projects/weblog2">weblog\n' )
        p.write ( "project documentation</a>.\n" )
    finally:
        p.close ( )


# - - -   s h o w S u m m a r y   - - -

def showSummary ( p, now, cutoffTime, nNear, nFar ):
    """Write the preformatted summary block (times and hit totals) to p

      [ if (p is a writeable file handle)
        and (now is the current epoch time)
        and (cutoffTime is the cutoff time as an epoch time)
        and (nNear is the total number of local hits)
        and (nFar is the total number of offsite hits) ->
          p  +:=  a <pre> block showing now and cutoffTime as local
                  times in TIME_FORMAT, followed by nFar, nNear, and
                  their sum ]
    """

    #-- 1 --
    # Rows of the display, in output order: first the two timestamps,
    # then the three counters.
    timeRows   =  ( ( "This report generated:   %s\n", now ),
                    ( "Accesses since:          %s\n", cutoffTime ) )
    countRows  =  ( ( "Requests from off-campus: %9d\n", nFar ),
                    ( "Requests from on-campus:  %9d\n", nNear ),
                    ( "Total requests:           %9d\n", nFar + nNear ) )

    #-- 2 --
    # [ p  +:=  opening tag, one line per row, closing tag ]
    p.write ( "<pre>\n" )
    for  template, epoch in timeRows:
        stamp  =  time.strftime ( TIME_FORMAT, time.localtime ( epoch ) )
        p.write ( template % stamp )
    for  template, count in countRows:
        p.write ( template % count )
    p.write ( "</pre>\n" )

# - - -   b u i l d U R L P a g e   - - -

def buildURLPage ( now, cutoffTime, nNear, nFar ):
    """Build a start page showing 1st segments, then one page for each such.

      [ if (now and cutoffTime are epoch times)
        and (nNear and nFar are total hit counts for the site) ->
          report-by-url-files  :=  reports-by-url ( cur-db, now,
                                                    nNear, nFar ) ]

      The index page is written to BASE_DIR + URL_PAGE; the segment
      pages themselves are produced by buildSegments().  The index file
      is closed even when writing it or building the segment pages
      raises an exception.
    """

    #-- 1 --
    # [ p  :=  a writeable file handle to a new, empty URL_PAGE ]
    p  =  open ( BASE_DIR + URL_PAGE, "w" )

    # try/finally so the handle is released on every exit path.
    try:
        #-- 2 --
        # [ p  +:=  page header showing now, cutoffTime, nNear, nFar ]
        title  =  "TCC Web server statistics: By URL"
        p.write ( "<title>%s</title>\n" % title )
        p.write ( "<h1>%s</h1>\n" % title )
        showSummary ( p, now, cutoffTime, nNear, nFar )

        p.write ( "<p>To find the statistics for a page, select the link\n" )
        p.write ( "below that matches the first part of the page's URL.\n" )
        p.write ( "<pre>\n" )

        #-- 3 --
        # [ p              +:=  links to segment pages
        #   segment-pages  :=   pages showing reports on URLs,
        #                       one for each first-segment(those URLs) ]
        buildSegments ( p )

        #-- 4 --
        # [ p  +:=  page footer ]
        p.write ( "</pre>\n" )
    finally:
        p.close ( )


# - - -   s h o w C o l u m n H e a d s   - - -

def showColumnHeads ( p ):
    """Write the three-line column heading used above usage lines.

      [ if p is a writeable file handle ->
          p  +:=  column heads matching the output of showUsage() ]
    """
    # Two caption rows and a rule; the widths line up with the
    # "%8d %5.1f" fields that showUsage() writes.
    headRows  =  ( " Total  Offsite\n",
                   "  Hits    Pct.\n",
                   "-------- -----\n" )
    for  row in headRows:
        p.write ( row )


# - - -   b u i l d S e g m e n t s   - - -

def buildSegments ( p ):
    """Break sum-db up by first-segment(url), write a page for each such

      [ if (p is a writeable file handle) ->
          p              +:=  links to segment pages from sum-db
          segment-pages  :=   pages showing reports on URLs from sum-db,
                              one for each first-segment(those URLs) ]

      Reads BASE_DIR + SUM_DB_NAME one record at a time.  Consecutive
      records with the same first segment go to the same segment page,
      so the grouping relies on the order of the records in sum-db.
      The sum-db handle is closed on every exit path.
    """

    #-- 1 --
    # [ oldSeg   :=  None
    #   segFile  :=  None
    #   segNo    :=  0
    #   f        :=  a readable file handle to sum-db ]
    oldSeg   =  None        # Previous segment name
    segFile  =  None        # Writeable file handle of each segment file
    segNo    =  0           # Number of segment pages started so far
    f        =  open ( BASE_DIR + SUM_DB_NAME )

    #-- 2 --
    # [ p              +:=  links to segment pages from f
    #   segment-pages  :=   pages showing reports on URLs from f,
    #                       one for each first-segment(those URLs)
    #   f              :=   f, closed ]
    try:
        line  =  f.readline()
        while  line != "":
            #-- 2.1 --
            # [ nHits  :=  total hits from line
            #   nFar   :=  offsite hits from line
            #   url    :=  URL from line
            #   seg    :=  first-segment(URL from line) ]
            # NB: Some urls contain no slash at all, hence the special
            # case.  Otherwise the segment is the text between the
            # first and second slashes.
            fields  =  line.rstrip().split ( FIELD_SEP )
            nHits   =  int ( fields[0] )
            nFar    =  int ( fields[1] )
            url     =  fields[2]
            parts   =  url.split ( "/" )
            if  len(parts) < 2:
                seg  =  ""            # Contains no slash
            else:
                seg  =  parts[1]      # Between first and second slash

            #-- 2.2 --
            # [ if (seg == oldSeg) -> I
            #   else ->
            #     finish the old segFile, if any
            #     segFile  :=   writeable file pointer to a new segment
            #                   file number (segNo+1), containing a header
            #     segNo    :=   segNo + 1
            #     p        +:=  link to that file, using seg as link text
            #   In any case ->
            #     oldSeg   :=   seg ]
            (segNo, segFile)  =  checkSegChange ( oldSeg, seg, segFile,
                                                  p, segNo )
            oldSeg            =  seg

            #-- 2.3 --
            # [ segFile  +:=  detail line made from (nHits, nFar, url) ]
            showUsage ( segFile, nHits, nFar, url )

            #-- 2.4 --
            # [ line  :=  next line from f, or "" at EOF ]
            line  =  f.readline()
    finally:
        f.close()

    #-- 3 --
    # [ if segFile is None -> I
    #   else ->
    #     segFile  +:=  trailer
    #     close segFile ]
    if  segFile:
        segFileFinish ( segFile )


# - - -   s h o w U s a g e   - - -

def showUsage ( p, nHits, nFar, url ):
    """Write one report line showing hits on a given URL.

      [ if (p is a writeable file handle)
        and (nHits is the total hits on a given URL)
        and (nFar is the total offsite hits on that URL)
        and (url is that URL as a string) ->
          p  +:=  a line showing nHits, nFar as a percentage of nHits
                  (0.0 when nHits is zero), and url with HTML special
                  characters escaped ]
    """

    #-- 1 --
    # [ farPct  :=  nFar as a percentage of nHits, or 0.0 if nHits is 0 ]
    # A record with no hits must not abort the whole report with a
    # ZeroDivisionError.
    if  nHits:
        farPct  =  ( 100.0 * nFar ) / nHits
    else:
        farPct  =  0.0

    #-- 2 --
    # [ safeURL  :=  url with &, <, > replaced by HTML entities ]
    # The line lands inside an HTML page, and the URL originates from
    # requests recorded in the access log, so it must not be able to
    # inject markup.  "&" is replaced first so that the entities
    # produced below are not themselves re-escaped.
    safeURL  =  ( "%s" % url ).replace ( "&", "&amp;" )
    safeURL  =  safeURL.replace ( "<", "&lt;" ).replace ( ">", "&gt;" )

    #-- 3 --
    p.write ( "%8d %5.1f %s\n" % ( nHits, farPct, safeURL ) )


# - - -   c h e c k S e g C h a n g e   - - -

def checkSegChange ( oldSeg, seg, segFile, p, segNo ):
    """Handle the transition between segments in the by-URL pages.

      [ if (oldSeg is the previous segment name, or None if none)
        and (seg is the new segment name)
        and (segFile is a writeable file handle, or None if no segment
             page is open)
        and (p is a writeable file handle)
        and (segNo is a nonnegative integer) ->
          if (seg == oldSeg) ->
            return (segNo, segFile)
          else ->
            if segFile is not None, write its trailer and close it
            new segFile  :=   writeable file pointer to a new segment
                              file number (segNo+1), containing a header
            p            +:=  link to that file, using seg (HTML-escaped)
                              as the link text
            return (segNo+1, the new segFile) ]
    """

    #-- 1 --
    # Same segment as before: keep writing to the current page.
    if  seg == oldSeg:
        return (segNo, segFile )

    #-- 2 --
    # [ if segFile is None -> I
    #   else ->
    #     write trailer to segFile and close it ]
    if  segFile:
        segFileFinish ( segFile )

    #-- 3 --
    # [ segNo      :=   segNo + 1
    #   shortName  :=   name, relative to BASE_DIR, of segment file
    #                   number (the new segNo)
    #   segFile    :=   writeable file pointer to a new, empty file by
    #                   that name under BASE_DIR ]
    segNo      =  segNo + 1
    shortName  =  "%s%s%s" % ( SEG_DIR, SEG_NAME_FMT % segNo, HTML_SUFFIX )
    segName    =  BASE_DIR + shortName
    segFile    =  open ( segName, "w" )

    #-- 4 --
    # [ p  +:=  link to shortName, using escaped seg as the link text ]
    # seg is taken from a logged URL, so escape the HTML special
    # characters before using it as link text; "&" goes first so the
    # entities produced afterwards are not re-escaped.
    linkText  =  ( "%s" % seg ).replace ( "&", "&amp;" )
    linkText  =  linkText.replace ( "<", "&lt;" ).replace ( ">", "&gt;" )
    p.write ( '  <a href="%s">/%s</a>\n' %
              ( shortName, linkText ) )

    #-- 5 --
    # [ segFile  +:=  a header showing segment name seg ]
    segFileStart ( segFile, seg )

    #-- 6 --
    return (segNo, segFile )


# - - -   s e g F i l e S t a r t   - - -

def segFileStart ( p, seg ):
    """Write the header part of a segment file

      [ if (p is a writeable file handle)
        and (seg is a segment name as a string) ->
          p  +:=  a title and heading naming segment seg (HTML-escaped),
                  an opening <pre> tag, and the report column heads ]
    """

    #-- 1 --
    # [ title  :=  page title for segment seg ]
    # The empty segment gets its own wording.  Any other segment name
    # comes from a logged URL, so its HTML special characters are
    # escaped before it goes into the title and heading; "&" is
    # replaced first so the later entities are not re-escaped.
    if  seg == "":
        title  =  ( "TCC Web Server statistics for the Tech homepage" )
    else:
        safeSeg  =  ( "%s" % seg ).replace ( "&", "&amp;" )
        safeSeg  =  safeSeg.replace ( "<", "&lt;" ).replace ( ">", "&gt;" )
        title    =  ( "TCC Web Server statistics for /%s/..." % safeSeg )

    #-- 2 --
    # [ p  +:=  title, heading, start of preformatted block, and
    #           column heads ]
    p.write ( "<title>%s</title>\n" % title )
    p.write ( "<h1>%s</h1>\n" % title )
    p.write ( "<pre>\n" )
    showColumnHeads ( p )


# - - -   s e g F i l e F i n i s h   - - -

def segFileFinish ( p ):
    """Terminate a segment page: write its trailer, then close it.

      [ if p is a writeable file handle ->
          p  +:=  HTML trailer for a segment file
          close p ]
    """
    # The trailer closes the <pre> block opened by segFileStart().
    trailer  =  "</pre>\n"
    p.write ( trailer )
    p.close ( )


# - - -   b u i l d H i t s P a g e   - - -

def buildHitsPage ( now, cutoffTime, nNear, nFar ):
    """Write a report showing pages by descending order of hit count.

      [ if (now and cutoffTime are epoch times)
        and (nNear and nFar are total hit counts for the site) ->
          report-by-hits-file  :=  report-by-hits ( cur-db, now,
                                   nNear, nFar ) ]

      Sorts BASE_DIR + SUM_DB_NAME into BASE_DIR + HITS_DB_NAME with the
      external `sort' utility, then writes BASE_DIR + HITS_PAGE from the
      sorted file.  Exits with status 1 if an old hits-db cannot be
      removed; a failing sort is reported on stderr.
    """

    #-- 1 --
    # [ hits-db  :=  sum-db sorted by (nHits descending, url) ]
    #--
    # IMPORTANT NOTE!  The `sort' utility under RedHat 9 will not
    # sort properly unless environment variable LC_ALL is set to "C".
    # From `man sort':
    #   *** WARNING *** The locale specified by the environment affects
    #   sort order.  Set LC_ALL=C to get the traditional sort order
    #   that uses native byte values.
    #--
    # The keys use the -k form.  The obsolete `+POS1 -POS2' form
    # (formerly "+0nr -1 +2" here) is rejected by current GNU sort
    # unless it is told to conform to an old POSIX version.
    #--
    os.putenv ( "LC_ALL", "C" )
    sumName   =  BASE_DIR + SUM_DB_NAME
    hitsName  =  BASE_DIR + HITS_DB_NAME
    command   =  ( "sort "              # Use Unix sort
                   "-T /fs/tmp "     # Use honking big temp partition
                   "-S 10M "         # Use 10 megabytes of memory
                   "-t'%s' "         # Use FIELD_SEP as the field separator
                   "-k1,1nr "        # Primary key is number of hits, desc.
                   "-k3 "            # Secondary key is the URL (3rd field)
                   "%s "             # Name of sum-db
                   ">%s" %           # Name of hits-db
                   ( FIELD_SEP, sumName, hitsName ) )
    try:
        if  os.path.exists ( hitsName ):
            os.remove ( hitsName )
    except OSError:
        # sys.exc_info() instead of the `except OSError, detail' form,
        # which later Python versions no longer accept.
        detail  =  sys.exc_info()[1]
        sys.stderr.write ( "Can't remove the hits db `%s' "
                           "before rewriting it.\n%s\n" %
                           ( hitsName, detail ) )
        sys.exit ( 1 )

    # A nonzero status means the shell or sort failed; say so instead
    # of silently producing a page from an empty or partial hits-db.
    status  =  os.system ( command )
    if  status != 0:
        sys.stderr.write ( "Warning: sorting `%s' into `%s' failed "
                           "(status %s).\n" %
                           ( sumName, hitsName, status ) )

    #-- 2 --
    # [ p  :=  a writeable file handle to a new HITS_PAGE containing
    #          a header and a summary for (now, cutoffTime, nNear, nFar) ]
    p  =  open ( BASE_DIR + HITS_PAGE, "w" )

    # try/finally so the page handle is released on every exit path.
    try:
        title  =  "TCC Web server statistics: by hits"
        p.write ( "<title>%s</title>\n" % title )
        p.write ( "<h1>%s</h1>\n" % title )
        p.write ( "<p>Pages with fewer than %d hits are omitted.\n" %
                  MIN_HITS )
        showSummary ( p, now, cutoffTime, nNear, nFar )
        p.write ( "<pre>\n\n" )
        showColumnHeads ( p )

        #-- 3 --
        # [ p  +:=  lines from records from hits-db in the same order,
        #           discarding lines with fewer than MIN_HITS total hits ]
        buildHitsLines ( p, hitsName )

        #-- 4 --
        # [ p  +:=  page footer ]
        p.write ( "</pre>\n" )
    finally:
        p.close ()


# - - -   b u i l d H i t s L i n e s   - - -

def buildHitsLines ( p, hitsName ):
    """Format hits-db and write to page p, stopping at nHits < MIN_HITS

      [ if (p is a writeable file handle)
        and (hitsName names hits-db) ->
          p  +:=  lines from records from hits-db in the same order,
                  up to but not including the first record with fewer
                  than MIN_HITS total hits ]

      Reading stops at the first record below MIN_HITS; buildHitsPage()
      sorts hits-db by descending hit count, so in that case every
      later record is below the threshold too.  The hits-db handle is
      closed on every exit path.
    """

    #-- 1 --
    # [ f  :=  an open file handle for the file named by hitsName ]
    f  =  open ( hitsName )

    #-- 2 --
    # [ p  +:=  lines from records in f in the same order, stopping
    #           before the first record with nHits < MIN_HITS
    #   f  :=   f, closed ]
    try:
        line  =  f.readline()
        while  line != "":
            #-- 2.1 --
            # [ nHits  :=  total hits field from line
            #   nFar   :=  offsite hits field from line
            #   url    :=  URL field from line ]
            fields  =  line.rstrip().split ( FIELD_SEP )
            nHits   =  int ( fields[0] )
            nFar    =  int ( fields[1] )
            url     =  fields[2]

            #-- 2.2 --
            # [ if nHits < MIN_HITS ->
            #     stop reading
            #   else ->
            #     p  +:=  line showing nHits, nFar as a percentage of
            #             nHits, and url ]
            if  nHits < MIN_HITS:
                break
            showUsage ( p, nHits, nFar, url )

            #-- 2.3 --
            # [ line  :=  next line from f, or "" at EOF ]
            line  =  f.readline()
    finally:
        f.close()



# - - - - -   w e b s t a t s . p y   - -   m a i n   - - - - -


sys.stderr.write ( "== %s %s ==\n" % ( PRODUCT_NAME, EXTERNAL_VERSION ) )

# Overall intended function:
# [ cur-db  :=  a new database representing (cur-db, if it exists, plus
#               valid access log records from <stdin>), minus any
#               records that have expired
#   report-web-files  :=  report-web ( that new database, now ) ]

#-- 1 --
# [ now         :=  the current time as an epoch time
#   todayTuple  :=  now as a local time tuple with the hour, minute
#                   and second fields zeroed
#   cutoffTime  :=  the epoch time of todayTuple, minus EXPIRE_SEC ]
now         =  time.time()
nowFields   =  time.localtime ( now )
todayTuple  =  ( tuple ( nowFields[:3] )       # year, month, day
                 + ( 0, 0, 0 )                 # hour, minute, second
                 + tuple ( nowFields[6:] ) )   # remaining fields as-is
cutoffTime  =  time.mktime ( todayTuple ) - EXPIRE_SEC

#-- 2 --
# [ cur-db  :=  a new database representing (cur-db, if it exists, plus
#               valid access log records from <stdin>), minus any
#               before cutoffTime
#   nNear   :=  number of local accesses from that new database
#   nFar    :=  number of offsite accesses from that new database ]
( nNear, nFar )  =  updateDatabase ( cutoffTime )

#-- 3 --
# [ sum-db  :=  records from cur-db summed by date to sum-db-format
#               and sorted by URL ]
sumDatabase()

#-- 4 --
# [ report-web-files  :=  report-web ( sum-db ) ]
writeWeb ( now, cutoffTime, nNear, nFar )

TCC home: TCC home
NMT home: NMT home

Last updated: 2009-11-22 13:24 MST