#!/usr/bin/env python
#================================================================
# webstats.py:  NM Tech Computer Center web statistics
#
# For documentation, see:
#   http://www.nmt.edu/tcc/projects/tccwebstats/      Specification
#   http://www.nmt.edu/tcc/projects/tccwebstats/ims/  Internals
#
# Names used below but not defined in the visible part of this
# file (message, error, FixedTimeZone, HitCount, scanAccessLog)
# are presumably supplied by "from pageget import *" or by a
# later part of this file -- TODO confirm.
#----------------------------------------------------------------

PRODUCT_NAME = "tccwebstats"
EXTERNAL_VERSION = "4.0"
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S %z"

#================================================================
# Imports
#----------------------------------------------------------------

import sys
import os
import math
import datetime

from etbuilder import E, et
import tccpage2
from pageget import *

#================================================================
# Manifest constants
#----------------------------------------------------------------

EXPIRE_DAYS = 30
ACCESS_LOGS_DIR = "/var/log/httpd/"
ACCESS_LOG_PATH = ACCESS_LOGS_DIR + "access_log"
DAYS_PER_LOG = 7
#--
# Number of rotated logs needed to cover EXPIRE_DAYS, rounded up.
#--
N_OLDER_LOGS = int(math.ceil(float(EXPIRE_DAYS) / DAYS_PER_LOG))
HITS_CUTOFF = 10


# - - - - -   c l a s s   W e b P a t h

class WebPath:
    '''Represents an absolute path accessible via the Web.

      Exports:
        WebPath ( url, absPath ):
          [ (url is a URL as a string) and
            (absPath is the equivalent absolute path as a string) ->
              return a new WebPath instance with those values ]
        .url:       [ as passed to constructor, read-only ]
        .absPath:   [ as passed to constructor, read-only ]
        .relative(relPath):
          [ relPath is a relative path as a string ->
              return a new WebPath instance representing relPath
              relative to self ]
    '''

    # - - -   W e b P a t h . _ _ i n i t _ _

    def __init__(self, url, absPath):
        '''Constructor
        '''
        self.url = url
        self.absPath = absPath

    # - - -   W e b P a t h . r e l a t i v e

    def relative(self, relPath):
        '''Return a new WebPath relative to self.

          Both the URL and the file path are extended with
          os.path.join(), so a relPath that starts with "/" replaces
          the base instead of extending it.
        '''
        return WebPath(os.path.join(self.url, relPath),
                       os.path.join(self.absPath, relPath))


NMT_WEB_PATH = WebPath("http://www.nmt.edu/", "/u/www/docs/")
TCC_WEB_PATH = NMT_WEB_PATH.relative("tcc/")
OUT_WEB_PATH = TCC_WEB_PATH.relative("webstats/")
PERSONAL_WEB_PATH = OUT_WEB_PATH.relative("p/")
OFFICIAL_WEB_PATH = OUT_WEB_PATH.relative("o/")
HTML_EXT = ".html"
INDEX_WEB_PATH = OUT_WEB_PATH.relative("index" + HTML_EXT)
BY_HITS_WEB_PATH = OUT_WEB_PATH.relative("byhits" + HTML_EXT)
LETTER_PREFIX = "pers-"
INSTITUTE_HOMEPAGE = '/'

TABLE_ATTRS = {'cellpadding': '3', 'cellspacing': '3', 'border': '3'}
L_ALIGN = {'align': 'left'}
R_ALIGN = {'align': 'right'}

LEAF_NAV_LINKS = [
    tccpage2.NavLink("Next", []),
    tccpage2.NavLink("Previous", []),
    tccpage2.NavLink("TCC Web server statistics",
        [("TCC Web server statistics", INDEX_WEB_PATH.url)]),
    tccpage2.NavLink("Tech Computer Center",
        [("Tech Computer Center", TCC_WEB_PATH.url)])]

INDEX_PAGE_TITLE = "TCC Web server statistics"
BY_HITS_TITLE = "TCC Web server statistics: hit parade"
LETTER_PAGE_TITLE = 'Personal pages starting with "%s~%s"'
ACCESS_REPORT_TITLE = 'Access report for %s%s%s'


# - - - - -   w e b s t a t s . p y  - -  m a i n

def main():
    '''Generate reports on Web page access at this Apache server.

      [ access-logs are readable ->
          index-page  :=  index-content(access-logs)
          hit-parade-page  :=  hits-content(access-logs)
          letter-pages(access-logs)  :=  letter-content(access-logs)
          personal-reports(access-logs)  :=
              personal-content(access-logs)
          official-reports(access-logs)  :=
              official-content(access-logs)
          sys.stderr  +:=  (version-greeting) + (error messages
              about invalid lines in access-logs, if any) ]
    '''
    #================================================================
    # Verification functions
    #----------------------------------------------------------------
    # access-logs  ==  Apache access_log file in /var/log/httpd
    #     and any files named "access_log.[1-5]" in that directory
    #----------------------------------------------------------------
    # index-page  ==  a page at INDEX_WEB_PATH
    #----------------------------------------------------------------
    # index-content(logs)  ==
    #     (summary of total hits in logs) +
    #     (link to hit-parade-page) +
    #     (official-report for "/" in logs) +
    #     (links to letter-pages(logs)) +
    #     (links to official-reports(logs))
    #----------------------------------------------------------------
    # hit-parade-page  ==  a page at BY_HITS_WEB_PATH
    #----------------------------------------------------------------
    # hits-content(logs)  ==
    #     a report showing all pages with at least HITS_CUTOFF hits in
    #     logs, in descending order by total hits, with URL as a
    #     secondary key
    #----------------------------------------------------------------
    # letter-pages(logs)  ==
    #     pages at (LETTER_PREFIX + c + HTML_EXT) relative to
    #     OUT_WEB_PATH, one for each unique first letter (c) in the
    #     personal page hits from logs
    #----------------------------------------------------------------
    # letter-content(logs)  ==
    #     for each unique first letter (c) in the personal page hits
    #     from logs, links to the personal-reports for pages whose
    #     users start with (c)
    #----------------------------------------------------------------
    # personal-reports(logs)  ==
    #     pages at (username + HTML_EXT) relative to
    #     PERSONAL_WEB_PATH for every personal username found in logs
    #----------------------------------------------------------------
    # personal-content(logs)  ==
    #     one report for each personal account appearing in logs,
    #     showing all URLs for that account in ascending order by URL
    #----------------------------------------------------------------
    # official-reports(logs)  ==
    #     pages at (dirname + HTML_EXT) relative to
    #     OFFICIAL_WEB_PATH for every official directory name found
    #     in logs
    #----------------------------------------------------------------
    # official-content(logs)  ==
    #     one report for each official directory appearing in logs,
    #     showing all URLs for that directory in ascending order by URL
    #----------------------------------------------------------------

    #-- 1 --
    # [ sys.stderr  +:=  greeting message and timestamp
    #   now  :=  the current time as a datetime.datetime
    #   cutoffTime  :=  a time EXPIRE_DAYS days before the
    #       current time as a datetime.datetime ]
    message("== %s %s == %s +0000\n" %
            (PRODUCT_NAME, EXTERNAL_VERSION,
             datetime.datetime.utcnow().strftime(DATE_FORMAT)))
    utcZone = FixedTimeZone(0, "UTC")
    thirtyDays = datetime.timedelta(EXPIRE_DAYS)
    now = datetime.datetime.now(utcZone)
    cutoffTime = now - thirtyDays

    #-- 2 --
    # [ access-logs are readable ->
    #     accessSummary  :=  a new AccessSummary instance
    #         containing all the relevant records from those logs ]
    accessSummary = inputPhase(cutoffTime, now)

    #-- 3 --
    # [ indexPage  :=  a new tccpage2.TCCPage instance with a "Tech
    #       Computer Center" navigational link ]
    navList = [
        tccpage2.NavLink("Next", []),
        tccpage2.NavLink("Previous", []),
        tccpage2.NavLink("Tech Computer Center",
            [("Tech Computer Center", TCC_WEB_PATH.url)])]
    indexPage = tccpage2.TCCPage(INDEX_PAGE_TITLE, navList,
                                 url=INDEX_WEB_PATH.url)

    #-- 4 --
    # [ indexPage  +:=  index-content(accessSummary)
    #   hit-parade-page  :=  hits-content(accessSummary)
    #   letter-pages(accessSummary)  :=  letter-content(accessSummary)
    #   personal-reports(accessSummary)  :=
    #       personal-content(accessSummary)
    #   official-reports(accessSummary)  :=
    #       official-content(accessSummary) ]
    buildAllPages(indexPage, accessSummary)

    #-- 5 --
    # [ file INDEX_WEB_PATH  :=  indexPage, serialized as XHTML ]
    # The original error path named an undefined INDEX_ABS_PATH, so
    # a failed open raised NameError instead of reporting the error.
    _writePage(indexPage, INDEX_WEB_PATH.absPath,
               "Can't open the index page '%s': %s")


# - - -   _ w r i t e P a g e

def _writePage(page, absPath, errorFormat):
    '''Serialize one page to a file, closing the file even on error.

      [ (page is a tccpage2.TCCPage instance) and
        (absPath is an absolute path name as a string) and
        (errorFormat is a format string with two "%s" fields, for
        the path name and the error detail, in that order) ->
          if absPath can be opened new for writing ->
            that file  :=  page, serialized
          else ->
            sys.stderr  +:=  error message built from errorFormat
            stop execution ]
    '''
    #-- 1 --
    # [ if absPath can be opened new for writing ->
    #     pageFile  :=  that file, so opened
    #   else ->
    #     sys.stderr  +:=  error message
    #     stop execution ]
    try:
        pageFile = open(absPath, "w")
    except IOError as detail:
        fatal(errorFormat % (absPath, detail))

    #-- 2 --
    # [ pageFile  +:=  page, serialized ]
    try:
        page.write(pageFile)
    finally:
        pageFile.close()


# - - -   i n p u t P h a s e

def inputPhase(cutoffTime, now):
    '''Read all the access logs and return an AccessSummary.

      [ (access-logs are readable) and
        (cutoffTime is the start of the report interval as a
        datetime.datetime) and
        (now is the end of the report interval as a
        datetime.datetime) ->
          return a new AccessSummary containing all the
          relevant records from those logs ]
    '''
    #-- 1 --
    # [ accessSummary  :=  an AccessSummary for the interval
    #       between cutoffTime and now ]
    accessSummary = AccessSummary(cutoffTime, now)

    #-- 2 --
    # [ if the file at ACCESS_LOG_PATH is readable ->
    #     accessSummary  :=  accessSummary with valid records
    #         added from that file
    #     sys.stderr  +:=  messages about lines in that file
    #         that aren't valid, if any
    #   else ->
    #     sys.stderr  +:=  error message
    #     stop execution ]
    try:
        readLogFile(accessSummary, ACCESS_LOG_PATH)
    except IOError as detail:
        fatal("Can't read the current access log file '%s': %s" %
              (ACCESS_LOG_PATH, detail))

    #-- 3 --
    # [ if any of the N_OLDER_LOGS files whose names are
    #   (ACCESS_LOG_PATH+"."+n), n="1", "2", ... are readable ->
    #     accessSummary  :=  accessSummary with valid records
    #         added from those files
    #     sys.stderr  +:=  messages about lines in those files
    #         that aren't valid, if any ]
    for logNo in range(1, N_OLDER_LOGS + 1):
        logName = "%s.%d" % (ACCESS_LOG_PATH, logNo)
        try:
            readLogFile(accessSummary, logName)
        except IOError:
            # Deliberate best effort: rotated logs may not exist yet.
            pass

    #-- 4 --
    return accessSummary


# - - -   r e a d L o g F i l e

def readLogFile(accessSummary, fileName):
    '''Process one access log.

      [ (accessSummary is an AccessSummary instance) and
        (fileName is a string) ->
          if fileName names a readable file ->
            accessSummary  :=  accessSummary with relevant records
                added from that file
            sys.stderr  +:=  error messages about invalid lines,
                if any
          else -> raise IOError ]
    '''
    #-- 1 --
    # [ if fileName names a readable file ->
    #     inFile  :=  that file, so opened
    #   else -> raise IOError ]
    inFile = open(fileName)

    #-- 2 --
    # [ accessSummary  :=  accessSummary + (records from inFile
    #       that are valid and pass AccessSummary's filters)
    #   sys.stderr  +:=  error messages about invalid lines
    #       from inFile, if any ]
    # The try/finally releases the file even if scanning fails.
    try:
        for pageGet in scanAccessLog(inFile):
            accessSummary.addPageGet(pageGet)
    finally:
        inFile.close()


# - - -   b u i l d A l l P a g e s

def buildAllPages(indexPage, accessSummary):
    '''Write all output pages.

      [ (indexPage is a tccpage2.TCCPage instance) and
        (accessSummary is an AccessSummary instance) ->
          indexPage  +:=  index-content(accessSummary)
          hit-parade-page  :=  hits-content(accessSummary)
          letter-pages(accessSummary)  :=
              letter-content(accessSummary)
          personal-reports(accessSummary)  :=
              personal-content(accessSummary)
          official-reports(accessSummary)  :=
              official-content(accessSummary) ]
    '''
    #-- 1 --
    # [ indexPage  +:=  (summary of total hits in accessSummary) ]
    addSummaryTable(indexPage.content, accessSummary)

    #-- 2 --
    # [ indexPage  +:=  (link to hit-parade-page made from
    #       accessSummary)
    #   hit-parade-page  :=  hits-content(accessSummary) ]
    buildHitParade(indexPage.content, accessSummary)

    #-- 3 --
    # [ indexPage  +:=  (official-content for "/" from accessSummary) ]
    instituteHomepage(indexPage.content, accessSummary)

    #-- 4 --
    # [ indexPage  +:=  (links to letter-pages(accessSummary)) +
    #       (links to official-reports(accessSummary))
    #   letter-pages(accessSummary)  :=  letter-content(accessSummary)
    #   personal-reports(accessSummary)  :=
    #       personal-content(accessSummary)
    #   official-reports(accessSummary)  :=
    #       official-content(accessSummary) ]
    buildCategoryTable(indexPage.content, accessSummary)

    #-- 5 --
    # [ indexPage  +:=  link to the specification ]
    indexPage.content.append(
        E.p("For documentation describing how these pages "
            "are built, see ",
            E.a("the specification.",
                href="http://www.nmt.edu/tcc/projects/tccwebstats4/")))


# - - -   a d d S u m m a r y T a b l e

def addSummaryTable(parent, accessSummary):
    '''Add the overall summary of accesses.

      [ (parent is an et.Element) and
        (accessSummary is an AccessSummary) ->
          parent  +:=  (summary of total hits in accessSummary) ]
    '''
    #-- 1 --
    # [ parent  +:=  a new 'table' element with attributes TABLE_ATTRS
    #   table  :=  that element ]
    table = E.table(TABLE_ATTRS)
    parent.append(table)

    #-- 2 --
    # [ table  +:=  row showing the current time from accessSummary.now ]
    timeFormat = "%Y-%m-%dT%H:%M %z"
    table.append(
        E.tr(E.th(L_ALIGN, "This report generated"),
             E.td(R_ALIGN, accessSummary.now.strftime(timeFormat))))

    #-- 3 --
    # [ table  +:=  row showing start time accessSummary.oldestHit,
    #       converted to UTC ]
    utcZone = FixedTimeZone(0, "UTC")
    oldestZulu = accessSummary.oldestHit.astimezone(utcZone)
    table.append(
        E.tr(E.th(L_ALIGN, "Accesses since"),
             E.td(R_ALIGN, oldestZulu.strftime(timeFormat))))

    #-- 4 --
    # [ table  +:=  rows showing total off-campus hits, total
    #       on-campus hits, and total hits from accessSummary ]
    nFar = accessSummary.sumHitCount.nFar
    nTotal = accessSummary.sumHitCount.nTotal
    table.append(
        E.tr(E.th(L_ALIGN, "Requests from off-campus"),
             E.td(R_ALIGN, str(nFar))))
    table.append(
        E.tr(E.th(L_ALIGN, "Requests from on-campus"),
             E.td(R_ALIGN, str(nTotal - nFar))))
    table.append(
        E.tr(E.th(L_ALIGN, "Total requests"),
             E.td(R_ALIGN, str(nTotal))))


# - - -   b u i l d H i t P a r a d e

def buildHitParade(parent, accessSummary):
    '''Build the hit parade page and the index page link to it.

      [ (parent is an et.Element) and
        (accessSummary is an AccessSummary) ->
          parent  +:=  (link to hit-parade-page made from
              accessSummary)
          hit-parade-page  :=  hits-content(accessSummary) ]
    '''
    #-- 1 --
    # [ parent  +:=  a new paragraph containing a link to
    #       BY_HITS_WEB_PATH ]
    parent.append(
        E.p(E.a("Access report by number of hits",
                href=BY_HITS_WEB_PATH.url)))

    #-- 2 --
    # [ hitsPage  :=  a new tccpage2.TCCPage instance ]
    hitsPage = tccpage2.TCCPage(BY_HITS_TITLE, LEAF_NAV_LINKS,
                                url=BY_HITS_WEB_PATH.url)

    #-- 3 --
    # [ hitsPage  +:=  a new access report as a 'table' element
    #   tbody  :=  the 'tbody' element of that table ]
    tbody = accessReport(hitsPage.content)

    #-- 4 --
    # [ tbody  +:=  access report rows made from HitCount
    #       instances in accessSummary, sorted according to
    #       HitCount.__cmp__ ]
    # The early exit relies on genByHits() yielding the busiest
    # pages first, as hits-content specifies.
    for hitCount in accessSummary.genByHits():
        if hitCount.nTotal < HITS_CUTOFF:
            break
        accessRow(tbody, hitCount)

    #-- 5 --
    # [ if BY_HITS_WEB_PATH can be opened new for writing ->
    #     that file  +:=  hitsPage, serialized as XHTML
    #   else ->
    #     sys.stderr  +:=  error message
    #     stop execution ]
    _writePage(hitsPage, BY_HITS_WEB_PATH.absPath,
               "Can't open the hit-parade page '%s': %s")


# - - -   a c c e s s R e p o r t

def accessReport(parent):
    '''Build a new access report

      [ parent is an et.Element ->
          parent  +:=  a new access report as a 'table' element
          return the 'tbody' element of that table ]
    '''
    #-- 1 --
    # [ tbody  :=  a new, empty 'tbody' element ]
    tbody = E.tbody()

    #-- 2 --
    # [ table  :=  a new 'table' element with three columns,
    #       headings for an access report, and tbody as its
    #       'tbody' element ]
    # The heading cells are wrapped in a 'tr': 'thead' may only
    # contain rows, and buildCategoryTable() already does it this way.
    table = E.table(TABLE_ATTRS,
        E.col(R_ALIGN),
        E.col(R_ALIGN),
        E.col(L_ALIGN),
        E.thead(
            E.tr(E.th("Total"), E.th("Offsite"), E.th("URL"))),
        tbody)
    parent.append(table)

    #-- 3 --
    return tbody


# - - -   a c c e s s R o w

def accessRow(tbody, hitCount):
    '''Add one row to an access table.

      [ (tbody is an et.Element) and
        (hitCount is a HitCount instance) ->
          tbody  +:=  an access report row displaying hitCount ]
    '''
    #-- 1 --
    # [ pct  :=  percentage off-campus from hitCount as a string ]
    # instituteHomepage() passes a zero-access HitCount when the
    # home page has no hits, so guard against dividing by zero.
    if hitCount.nTotal:
        offsite = (100.0 * hitCount.nFar) / hitCount.nTotal
    else:
        offsite = 0.0
    pct = "%.1f%%" % offsite

    #-- 2 --
    # [ tbody  +:=  a 'tr' element displaying three values:
    #       hitCount.nTotal, pct, and a link to hitCount.url ]
    # The leading "/" is dropped so that relative() extends the
    # base URL instead of replacing it.
    pageUrl = NMT_WEB_PATH.relative(hitCount.url[1:]).url
    tbody.append(
        E.tr(E.td(R_ALIGN, str(hitCount.nTotal)),
             E.td(R_ALIGN, pct),
             E.td(L_ALIGN, E.a(hitCount.url, href=pageUrl))))


# - - -   i n s t i t u t e H o m e p a g e

def instituteHomepage(parent, accessSummary):
    '''Build the access report for INSTITUTE_HOMEPAGE.

      [ (parent is an et.Element) and
        (accessSummary is an AccessSummary) ->
          parent  +:=  (official-content for "/" from accessSummary) ]
    '''
    #-- 1 --
    # [ if accessSummary has any access for INSTITUTE_HOMEPAGE ->
    #     hitCount  :=  its access counts as a HitCount
    #   else ->
    #     hitCount  :=  a new HitCount showing zero accesses ]
    try:
        hitCount = accessSummary.getUrl(INSTITUTE_HOMEPAGE)
    except KeyError:
        hitCount = HitCount(INSTITUTE_HOMEPAGE)

    #-- 2 --
    # [ parent  +:=  heading "Access report for the NMT homepage" ]
    parent.append(E.p("Access report for the NMT homepage"))

    #-- 3 --
    # [ parent  +:=  a new access report as a 'table'
    #   tbody  :=  the 'tbody' element of that table ]
    tbody = accessReport(parent)

    #-- 4 --
    # [ tbody  +:=  an access report row displaying hitCount ]
    accessRow(tbody, hitCount)


# - - -   b u i l d C a t e g o r y T a b l e

def buildCategoryTable(parent, accessSummary):
    '''Build all the personal/official reports and their indices.

      [ (parent is an et.Element) and
        (accessSummary is an AccessSummary) ->
          parent  +:=  (links to letter-pages(accessSummary)) +
              (links to official-reports(accessSummary))
          letter-pages(accessSummary)  :=
              letter-content(accessSummary)
          personal-reports(accessSummary)  :=
              personal-content(accessSummary)
          official-reports(accessSummary)  :=
              official-content(accessSummary) ]
    '''
    #-- 1 --
    # [ parent  +:=  a new 'table' element for the category report
    #   personalCell  :=  the 'td' element for personal letter
    #       index links
    #   officialCell  :=  the 'td' element for official report
    #       links ]
    table = E.table(TABLE_ATTRS,
        E.thead(
            E.tr(
                E.th("Personal pages starting with ",
                     E.tt('"%s~"' % NMT_WEB_PATH.url)),
                E.th("Official pages starting with ",
                     E.tt('"%s"' % NMT_WEB_PATH.url)))))
    parent.append(table)
    tbody = et.SubElement(table, 'tbody')
    tr = et.SubElement(tbody, 'tr')
    personalCell = et.SubElement(tr, 'td', valign='top')
    officialCell = et.SubElement(tr, 'td', valign='top')

    #-- 2 --
    # [ personalCell  +:=  links to letter-pages(accessSummary)
    #   letter-pages(accessSummary)  :=  letter-content(accessSummary)
    #   personal-reports(accessSummary)  :=
    #       personal-content(accessSummary) ]
    buildPersonalSide(personalCell, accessSummary)

    #-- 3 --
    # [ officialCell  +:=  links to official-reports(accessSummary)
    #   official-reports(accessSummary)  :=
    #       official-content(accessSummary) ]
    buildOfficialSide(officialCell, accessSummary)


# - - -   b u i l d P e r s o n a l S i d e

def buildPersonalSide(cell, accessSummary):
    '''Add links to letter pages, and build letter and personal pages.

      [ (cell is an et.Element) and
        (accessSummary is an AccessSummary instance) ->
          cell  +:=  links to letter-pages(accessSummary)
          letter-pages(accessSummary)  :=
              letter-content(accessSummary)
          personal-reports(accessSummary)  :=
              personal-content(accessSummary) ]
    '''
    #-- 1 --
    for letter in accessSummary.genPersonalLetters():
        #-- 1 body --
        # [ cell  +:=  link to a letter-page for (letter)
        #   letter-page for (letter)  :=  letter-content for (letter)
        #   personal-reports for accounts in accessSummary that start
        #     with (letter)  :=  personal-content for those accounts ]

        #-- 1.1 --
        # [ letterWebPath  :=  WebPath for letter-page for (letter) ]
        relPath = "%s%s%s" % (LETTER_PREFIX, letter, HTML_EXT)
        letterWebPath = OUT_WEB_PATH.relative(relPath)

        #-- 1.2 --
        # [ cell  +:=  link to letterWebPath.url using link text
        #       (NMT_WEB_PATH + "~" + letter + "...") ]
        linkText = "%s~%s..." % (NMT_WEB_PATH.url, letter)
        cell.append(E.div(E.a(linkText, href=letterWebPath.url)))

        #-- 1.3 --
        # [ letter-page for (letter) from accessSummary  :=
        #       letter-content for (letter)
        #   personal-reports for accounts in accessSummary that start
        #     with (letter)  :=  personal-content for those accounts ]
        buildLetter(letter, letterWebPath, accessSummary)


# - - -   b u i l d L e t t e r

def buildLetter(letter, letterWebPath, accessSummary):
    '''Build one letter page and all its personal pages.

      [ (letter is a one-character string) and
        (letterWebPath is a WebPath instance) and
        (accessSummary is an AccessSummary instance) ->
          letter-page for (letter) from accessSummary  :=
              letter-content for (letter)
          personal-reports for accounts in accessSummary that start
            with (letter)  :=  personal-content for those accounts ]
    '''
    #-- 1 --
    # [ letterPage  :=  a new tccpage2.TCCPage instance using
    #       the URL from letterWebPath ]
    pageTitle = LETTER_PAGE_TITLE % (NMT_WEB_PATH.url, letter)
    letterPage = tccpage2.TCCPage(pageTitle, LEAF_NAV_LINKS,
                                  url=letterWebPath.url)

    #-- 2 --
    # [ letterPage.content  +:=  a new 'ul' element
    #   ul  :=  that element ]
    ul = et.SubElement(letterPage.content, 'ul')

    #-- 3 --
    # [ ul  +:=  'li' elements containing links to personal-report
    #       pages for each personal name in accessSummary
    #       that starts with (letter)
    #   personal-reports for accounts in accessSummary that start
    #     with (letter)  :=  personal-content for those accounts ]
    for userName in accessSummary.genPersonals(letter):
        #-- 3 body --
        # [ ul  +:=  an 'li' element containing a link to a
        #       personal-report page for (userName)
        #   personal-report for (userName)  :=  personal-content
        #       for (userName) ]
        addPersonalReport(ul, userName, accessSummary)

    #-- 4 --
    # [ file letterWebPath.absPath  :=  letterPage, serialized ]
    _writePage(letterPage, letterWebPath.absPath,
               "Can't open letter page '%s': %s.")


# - - -   a d d P e r s o n a l R e p o r t

def addPersonalReport(ul, userName, accessSummary):
    '''Build the access report page for one person's account.

      [ (ul is an et.Element) and
        (userName is a personal account name) and
        (accessSummary is an AccessSummary instance) ->
          ul  +:=  an 'li' element containing a link to a
              personal-report page for (userName)
          personal-report for (userName)  :=  personal-content
              for (userName) ]
    '''
    #-- 1 --
    # [ webPath  :=  a WebPath instance representing userName's
    #       page relative to PERSONAL_WEB_PATH ]
    fileName = userName + HTML_EXT
    webPath = PERSONAL_WEB_PATH.relative(fileName)

    #-- 2 --
    # [ ul  +:=  an 'li' element containing a link to webPath ]
    linkText = "%s~%s" % (NMT_WEB_PATH.url, userName)
    ul.append(E.li(E.a(linkText, href=webPath.url)))

    #-- 3 --
    # [ personal-report for (userName)  :=  personal-content
    #       for (userName) from accessSummary at webPath ]
    buildReportPage(userName, webPath, "~", accessSummary,
                    accessSummary.genPersonUrls)


# - - -   b u i l d R e p o r t P a g e

def buildReportPage(userName, webPath, tilde, accessSummary, genUrls):
    '''Build the access report for one personal or official web.

      [ (userName is a user name or official directory name as a
        string) and
        (webPath is a WebPath instance) and
        (tilde is "~" for personal pages, "" for official) and
        (accessSummary is an AccessSummary instance) and
        (genUrls is a bound method that generates all the URLs
        for a given user or directory name) ->
          report page at webPath  :=  access report for (userName)
              from accessSummary ]
    '''
    #-- 1 --
    # [ page  :=  a new tccpage2.TCCPage instance with title
    #       ACCESS_REPORT_TITLE and navigation list LEAF_NAV_LINKS
    #       at path webPath ]
    pageTitle = ACCESS_REPORT_TITLE % (NMT_WEB_PATH.url, tilde, userName)
    page = tccpage2.TCCPage(pageTitle, LEAF_NAV_LINKS, url=webPath.url)

    #-- 2 --
    # [ page.content  +:=  a new access report as a 'table' element
    #   tbody  :=  the 'tbody' element of that table ]
    tbody = accessReport(page.content)

    #-- 3 --
    # [ tbody  +:=  rows showing the hit counts for URLs
    #       generated by genUrls() ]
    for url in genUrls(userName):
        #-- 3 body --
        # [ tbody  +:=  a row showing the hit counts for url
        #       from accessSummary ]
        hitCount = accessSummary.getUrl(url)
        accessRow(tbody, hitCount)

    #-- 4 --
    # [ file webPath.absPath  :=  page, serialized ]
    _writePage(page, webPath.absPath,
               "Can't open access report page '%s': %s")


# - - -   b u i l d O f f i c i a l S i d e

def buildOfficialSide(cell, accessSummary):
    '''Build all official access reports.

      [ (cell is an et.Element) and
        (accessSummary is an AccessSummary instance) ->
          cell  +:=  links to official-reports(accessSummary)
          official-reports(accessSummary)  :=
              official-content(accessSummary) ]
    '''
    #-- 1 --
    for dirName in accessSummary.genOfficials():
        #-- 1 body --
        # [ cell  +:=  link to official-report for dirName
        #   official-report for dirName  :=  official-content
        #       for dirName from accessSummary ]

        #-- 1.1 --
        # [ officialWebPath  :=  WebPath for official-report
        #       for (dirName) ]
        relPath = dirName + HTML_EXT
        officialWebPath = OFFICIAL_WEB_PATH.relative(relPath)

        #-- 1.2 --
        # [ cell  +:=  link to officialWebPath using link text
        #       (NMT_WEB_PATH + dirName + "/...") ]
        linkText = "%s%s/..." % (NMT_WEB_PATH.url, dirName)
        cell.append(E.div(E.a(linkText, href=officialWebPath.url)))

        #-- 1.3 --
        # [ official-report for dirName  :=  official-content
        #       for dirName from accessSummary ]
        buildReportPage(dirName, officialWebPath, "", accessSummary,
                        accessSummary.genOfficialUrls)


# - - -   f a t a l   - - -

def fatal(*L):
    """Write an error message and terminate.

      [ L is a list of strings ->
          sys.stderr  +:=  elements of L, concatenated
          stop execution ]
    """
    error("Fatal error: ", "".join(L))
    sys.exit(1)


# - - - - -   c l a s s   A c c e s s S u m m a r y

class AccessSummary:
    '''Container for all report summary data.

      Exports:
        AccessSummary(cutoffTime, now):
          [ (cutoffTime is the beginning of the report interval as a
            datetime.datetime) and
            (now is the end of the report interval as a
            datetime.datetime) ->
              return a new, empty AccessSummary instance for that
              interval ]
        .cutoffTime:
          [ the time self.EXPIRE_DAYS in the past as a
            datetime.datetime ]
        .now:
          [ the time of instantiation as a datetime.datetime ]
        .oldestHit:
          [ the oldest timestamp observed in any access record,
            as a datetime.datetime instance ]
        .sumHitCount:
          [ a HitCount instance giving the overall total and
            remote hit counts for all accesses in self ]
        .addPageGet ( self, pageGet ):
          [ pageGet is a PageGet instance ->
              if (pageGet is not older than self.cutoffTime) and
              (pageGet is relevant by all filtering criteria) ->
                self  :=  self with that access added ]
        .getUrl ( url ):
          [ url is a URL as a string ->
              if self has any accesses for url ->
                return its access counts as a HitCount instance
              else -> raise KeyError ]
        .genByHits():
          [ generate the HitCount instances in self sorted
            according to HitCount.__cmp__() ]
        .genPersonalLetters():
          [ generate the sequence of initial letters of personal
            account names in ascending order as a sequence of
            strings ]
        .genPersonals(letter):
          [ letter is a 1-character string ->
              generate all the personal account names in self
              that start with letter ]
        .genOfficials():
          [ generate all the official directory names in self
            in ascending order ]
        .genPersonUrls(person):
          [ person is a TCC account name ->
              generate the URLs in self for this person as
              a sequence of strings ]
        .genOfficialUrls(dir):
          [ dir is an official directory name ->
              generate the URLs in self for this directory as
              a sequence of strings ]

      State/Invariants:
        .__urlMap:
          [ a dictionary whose keys are all the URLs in self
            and each corresponding value is a HitCount instance
            summarizing the hits on that URL in self's reporting
            period ]
        .__personalLetterMap:
          [ a dictionary whose keys are the first characters of
            personal directories in self, and each corresponding
            value is a set of the personal directories in self
            that have that first character ]
        .__personalMap:
          [ a dictionary whose keys are the names of personal
            directories in self, and each corresponding value is
            a set of the URLs for that person ]
        .__officialMap:
          [ a dictionary whose keys are the names of official
            directories in self, and each corresponding value is
            a set of the URLs for that directory ]
    '''
    EXPIRE_DAYS = 30
    SYM_DOMAIN = ["nmt", "edu"]
    IP_DOMAIN = ["129", "138"]
    BAD_STATUS_THRESHOLD = 300
    IGNORED_EXTENSIONS = set([
        ".bmp", ".css", ".g", ".gif", ".ico", ".jpg", ".jpeg",
        ".png", ".swf", ".tif", ".tiff"])
    SPIDER_STRINGS = ("crawl", "msnbot-", "monitor.nmt.edu")

    # - - -   A c c e s s S u m m a r y . _ _ i n i t _ _

    def __init__(self, cutoffTime, now):
        '''Constructor for AccessSummary.
        '''
        #-- 1 --
        # [ self.cutoffTime  :=  cutoffTime
        #   self.now, self.oldestHit  :=  now
        #   self.sumHitCount  :=  a new, empty HitCount ]
        self.cutoffTime = cutoffTime
        self.now = self.oldestHit = now
        self.sumHitCount = HitCount('')

        #-- 2 --
        # [ self's four maps  :=  new, empty dictionaries ]
        self.__urlMap = {}
        self.__personalLetterMap = {}
        self.__personalMap = {}
        self.__officialMap = {}

    # - - -   A c c e s s S u m m a r y . a d d P a g e G e t

    def addPageGet(self, pageGet):
        '''Filter and possibly add one access record.
        '''
        #-- 1 --
        # [ if pageGet passes all self's filters ->
        #     self  :=  self with pageGet added
        #   else -> I ]
        if self.__isRelevant(pageGet):
            self.__addHit(pageGet.url, pageGet.when,
                          pageGet.isFar(self.SYM_DOMAIN, self.IP_DOMAIN))

    # - - -   A c c e s s S u m m a r y . _ _ i s R e l e v a n t

    def __isRelevant(self, pageGet):
        '''Filtering for access records.

          [ pageGet is a PageGet instance ->
              if pageGet passes all the filters in
              self.FILTER_FUNCTIONS ->
                return True
              else -> return False ]
        '''
        #-- 1 --
        # [ if any function in self.FILTER_FUNCTIONS,
        #   operating on pageGet, returns False ->
        #     return False
        #   else -> I ]
        # FILTER_FUNCTIONS holds plain functions, not bound
        # methods, so self is passed explicitly.
        for f in self.FILTER_FUNCTIONS:
            if not f(self, pageGet):
                return False

        #-- 2 --
        return True

    # - - -   A c c e s s S u m m a r y . _ _ s t a t u s F i l t e r

    def __statusFilter(self, pageGet):
        '''Filter out failed page accesses.

          [ pageGet is a PageGet instance ->
              if pageGet.status is less than
              self.BAD_STATUS_THRESHOLD ->
                return True
              else -> return False ]
        '''
        return pageGet.status < self.BAD_STATUS_THRESHOLD

    # - - -   A c c e s s S u m m a r y . _ _ e x t F i l t e r

    def __extFilter(self, pageGet):
        '''Filter out selected file extensions.

          [ pageGet is a PageGet instance ->
              if the file extension of pageGet.url is not found
              in self.IGNORED_EXTENSIONS ->
                return True
              else -> return False ]
        '''
        #-- 1 --
        # [ ext  :=  file extension from pageGet.url, lowercased ]
        front, back = os.path.splitext(pageGet.url)
        ext = back.lower()

        #-- 2 --
        # [ if ext is in self.IGNORED_EXTENSIONS ->
        #     return False
        #   else -> return True ]
        return ext not in self.IGNORED_EXTENSIONS

    # - - -   A c c e s s S u m m a r y . _ _ s p i d e r F i l t e r

    def __spiderFilter(self, pageGet):
        '''Filter out accesses by search engine spiders.

          [ pageGet is a PageGet instance ->
              if no string in self.SPIDER_STRINGS is found in
              pageGet.accessor ->
                return True
              else -> return False ]
        '''
        for s in self.SPIDER_STRINGS:
            if s in pageGet.accessor:
                return False
        return True

    # - - -   A c c e s s S u m m a r y . _ _ p w d F i l t e r

    def __pwdFilter(self, pageGet):
        '''Filter out password-protected pages.

          [ pageGet is a PageGet instance ->
              if pageGet pertains to a page that is not
              password-protected ->
                return True
              else -> return False ]
        '''
        return pageGet.username == '-'

    # - - -   A c c e s s S u m m a r y . _ _ t i m e F i l t e r

    def __timeFilter(self, pageGet):
        '''Filter out expired records.

          [ pageGet is a PageGet instance ->
              if pageGet.when is in the interval
              [self.cutoffTime, self.now] ->
                return True
              else -> return False ]
        '''
        return self.cutoffTime <= pageGet.when <= self.now

    # - - -   A c c e s s S u m m a r y . _ _ s p e c i a l F i l t e r

    def __specialFilter(self, pageGet):
        '''Filter out special cases.

          [ pageGet is a PageGet instance ->
              if pageGet is something we would prefer to ignore ->
                return False
              else -> return True ]
        '''
        #-- 1 --
        if pageGet.url.startswith("/robots.txt"):
            return False

        #-- 2 --
        if pageGet.url.startswith("/~ "):
            return False

        #-- 3 --
        # [ if the third character of the URL is uppercase ->
        #     return False ]
        if pageGet.url[2:3].isupper():
            return False

        #-- 4 --
        if pageGet.url.startswith("http:"):
            return False
        else:
            return True

    FILTER_FUNCTIONS = (__statusFilter, __extFilter, __spiderFilter,
                        __pwdFilter, __timeFilter, __specialFilter)

    # - - -   A c c e s s S u m m a r y . _ _ a d d H i t

    def __addHit(self, url, when, isFar):
        '''Register one access.

          [ (url is a URL as a string) and
            (when is the access time as a datetime.datetime) and
            (isFar is True for an off-campus access, False for
            on-campus) ->
              self  :=  self with that access added ]
        '''
        #-- 1 --
        # [ self.sumHitCount  +:=  one access with isFar=isFar ]
        self.sumHitCount.addHit(isFar)

        #-- 2 --
        # [ if when < self.oldestHit ->
        #     self.oldestHit  :=  when
        #   else -> I ]
        self.oldestHit = min(self.oldestHit, when)

        #-- 3 --
        # [ self.__urlMap  +:=  one access for url=url and
        #       isFar=isFar ]
        self.__addUrl(url, isFar)

        #-- 4 --
        # [ if url is "/" ->
        #     I
        #   else if url is for a personal page ->
        #     self.__personalLetterMap  +:=  entry for the third
        #         character of url
        #     self.__personalMap  +:=  entry for url
        #   else ->
        #     self.__officialMap  +:=  entry for url ]
        self.__addCategory(url)

    # - - -   A c c e s s S u m m a r y . _ _ a d d U r l

    def __addUrl(self, url, isFar):
        '''Update self.__urlMap.

          [ (url is an URL as a string) and
            (isFar is True for off-campus accessor, else False) ->
              self.__urlMap  +:=  one access for url=url and
                                  isFar=isFar ]
        '''
        #-- 1 --
        # [ if self.__urlMap does not have url as a key ->
        #     self.__urlMap[url]  :=  hitCount  :=  a new HitCount
        #         instance for url=url
        #   else ->
        #     hitCount  :=  self.__urlMap[url] ]
        try:
            hitCount = self.__urlMap[url]
        except KeyError:
            hitCount = self.__urlMap[url] = HitCount(url)

        #-- 2 --
        # [ if isFar ->
        #     hitCount  :=  hitCount with one total access added
        #         and one remote access added
        #   else ->
        #     hitCount  :=  hitCount with one total access added ]
        hitCount.addHit(isFar)

    # - - -   A c c e s s S u m m a r y . _ _ a d d C a t e g o r y

    def __addCategory(self, url):
        '''Add this URL to the personal or official dictionaries.

          [ url is an URL as a string ->
              if url is "/" or otherwise length 1 ->
                I
              else if url is for a personal page with a nonempty
              account name ->
                self.__personalLetterMap  +:=  entry for the third
                    character of url
                self.__personalMap  +:=  entry for url
              else if url is for a personal page ->
                I
              else ->
                self.__officialMap  +:=  entry for url ]
        '''
        #-- 1 --
        if len(url) < 2:
            return

        #-- 2 --
        # [ if url[1] != '~' ->
        #     self.__officialMap[directory name from url]  +:=  url
        #     return
        #   else -> I ]
        if url[1] != '~':
            #-- 2.1 --
            # [ dirName  :=  portion of url[1:] up to the next
            #       "/" if there is one, or to the end otherwise ]
            dirName = url[1:].split('/')[0]

            #-- 2.2 --
            # [ self.__officialMap[dirName]  +:=  url
            #   return ]
            self.__officialMap.setdefault(dirName, set()).add(url)
            return

        #-- 3 --
        # [ dirName  :=  portion of url[2:] up to the next "/" if
        #       there is one, or to the end otherwise
        #   if dirName is empty -> return
        #   else -> first  :=  url[2] ]
        # "/~" used to raise IndexError on url[2], and "/~/..."
        # yielded an empty account name filed under letter "/".
        # Such hits stay in self.__urlMap but name no account.
        dirName = url[2:].split('/')[0]
        if not dirName:
            return
        first = dirName[0]

        #-- 4 --
        # [ self.__personalLetterMap[first]  +:=  dirName
        #   self.__personalMap[dirName]  +:=  url ]
        self.__personalLetterMap.setdefault(first, set()).add(dirName)
        self.__personalMap.setdefault(dirName, set()).add(url)

    # - - -   A c c e s s S u m m a r y . g e t U r l

    def getUrl(self, url):
        '''Retrieve the HitCount for a given URL.

          Raises KeyError if self has no accesses for url.
        '''
        return self.__urlMap[url]

    # - - -   A c c e s s S u m m a r y . g e n B y H i t s

    def genByHits(self):
        '''Generate the hit parade.

          The order is whatever sorted() gives for HitCount
          instances, i.e., the ordering HitCount itself defines.
        '''
        # A generator ends by returning.  An explicit
        # "raise StopIteration" becomes RuntimeError under PEP 479.
        for hitCount in sorted(self.__urlMap.values()):
            yield hitCount

    # - - -   A c c e s s S u m m a r y . g e n P e r s o n a l L e t t e r s

    def genPersonalLetters(self):
        '''Generate the first characters of personal accounts.
        '''
        for letter in sorted(self.__personalLetterMap.keys()):
            yield letter

    # - - -   A c c e s s S u m m a r y . g e n P e r s o n a l s

    def genPersonals(self, first):
        '''Generate all personal account names starting with 'first'.
        '''
        #-- 1 --
        # [ if self.__personalLetterMap has a key (first) ->
        #     nameSet  :=  the corresponding value
        #   else -> generate nothing ]
        try:
            nameSet = self.__personalLetterMap[first]
        except KeyError:
            return

        #-- 2 --
        # [ generate the values in nameSet in ascending order ]
        for name in sorted(nameSet):
            yield name

    # - - -   A c c e s s S u m m a r y . g e n O f f i c i a l s

    def genOfficials(self):
        '''Generate the official directory names, sorted.
        '''
        for dirName in sorted(self.__officialMap.keys()):
            yield dirName

    # - - -   A c c e s s S u m m a r y . g e n P e r s o n U r l s

    def genPersonUrls(self, name):
        '''Generate all URLs for a personal account.
        '''
        #-- 1 --
        # [ if name is a key in self.__personalMap ->
        #     urlSet  :=  the corresponding value
        #   else -> generate nothing ]
        try:
            urlSet = self.__personalMap[name]
        except KeyError:
            return

        #-- 2 --
        # [ generate the members of urlSet in ascending order ]
        for url in sorted(urlSet):
            yield url

    # - - -   A c c e s s S u m m a r y .
g e n O f f i c i a l U r l s def genOfficialUrls ( self, name ): '''Generate all URLs for a officialal account. ''' #-- 1 -- # [ if name is a key in self.__officialMap -> # urlSet := the corresponding value # else -> raise StopIteration ] try: urlSet = self.__officialMap[name] except KeyError: raise StopIteration #-- 2 -- # [ generate the members of urlSet in ascending order ] for url in sorted(urlSet): yield url #-- 3 -- raise StopIteration # - - - - - c l a s s H i t C o u n t class HitCount(object): '''Represents the access data for one URL in the report period. Exports: HitCount ( url ): [ url is a URL as a string -> return a new HitCount instance for that URL and zero access counts ] .url: [ as passed to constructor, read-only ] .nTotal: [ total hits in self as an int ] .nFar: [ total off-campus hits in self as an int ] .addHit ( isFar ): [ isFar is True for off-campus, False otherwise -> if isFar -> self := self with one additional total hit and one additional off-campus hit else -> self := self with one additional total hit ] .__cmp__(self, other): [ if self.nTotal < other.nTotal -> return a positive int else self.nTotal > other.nTotal -> return a negative int else -> return cmp(self.url, other.url) ] ''' __slots__ = ('nTotal', 'nFar', 'url') # - - - H i t C o u n t . _ _ i n i t _ _ def __init__ ( self, url ): '''Constructor for HitCount. ''' self.url = url self.nTotal = self.nFar = 0 # - - - H i t C o u n t . a d d H i t def addHit ( self, isFar ): '''Add one hit from the given accessor URL. ''' #-- 1 -- self.nTotal += 1 #-- 2 -- if isFar: self.nFar += 1 # - - - H i t C o u n t . _ _ c m p _ _ def __cmp__ ( self, other ): '''Sort descending by .ntotal, ascending by .url ''' return cmp ( (other.nTotal, self.url), (self.nTotal, other.url) ) # - - - H i t C o u n t . 
_ _ l t _ _ def __lt__ ( self, other ): '''Sort descending by .ntotal, ascending by .url ''' return (other.nTotal, self.url) < (self.nTotal, other.url) #================================================================ # Epilogue #---------------------------------------------------------------- if __name__ == '__main__': main()