Bruger:Wegge/Statistik/DumpToDat.py
Many eyes make all bugs shallow
This is version 0.1 of the program, written in Python, that I use to mangle an XML dump into plottable data. It has not been tested all that much, so if you find a bug, I would like to hear about it. The program is released under the GPL, which is close enough to the GFDL that the source can live on Wikipedia.
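The program is built on Python's event-driven xml.sax parser, so the dump is streamed rather than held in memory: the handler just reacts to start tags, character data and end tags as they pass by. As a minimal sketch of that pattern, here is a tiny handler run on a hypothetical dump fragment (the element names match the MediaWiki export format, but the page and its content are invented for illustration):

from xml.sax import make_parser
from xml.sax.handler import ContentHandler
from StringIO import StringIO

fragment = """<page>
  <title>Forside</title>
  <revision>
    <timestamp>2006-02-20T12:00:00Z</timestamp>
    <contributor><username>WeggeBot</username></contributor>
    <text>#REDIRECT [[Anden side]]</text>
  </revision>
</page>"""

class TitlePrinter(ContentHandler):
    # Collect character data while inside <title>, print it at </title>
    def __init__(self):
        ContentHandler.__init__(self)
        self.buf = []
        self.inTitle = False
    def startElement(self, name, attrs):
        if name == 'title':
            self.inTitle = True
            self.buf = []
    def characters(self, ch):
        if self.inTitle:
            self.buf.append(ch)
    def endElement(self, name):
        if name == 'title':
            self.inTitle = False
            print 'Saw page: ' + "".join(self.buf)

parser = make_parser()
parser.setContentHandler(TitlePrinter())
parser.parse(StringIO(fragment))

The full program, which applies the same pattern to pages, revisions, timestamps and contributors, follows.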
#! /usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright(2006) Anders Wegge Jakobsen
# Available under the FSF GPL license
#

from xml.sax import make_parser
from xml.sax.handler import ContentHandler
import time, calendar

# Page::Title, Page::FirstRevDate
# User::FirstEditDate
# Date::Users, Date::Pages

debug = 0

danamespaces = [ 'Media', 'Speciel', 'Diskussion', 'Bruger',
                 'Bruger diskussion', 'Wikipedia',
                 'Wikipedia diskussion', 'Billede',
                 'Billede diskussion', 'MediaWiki',
                 'MediaWiki diskussion', 'Skabelon',
                 'Skabelon diskussion', u'Hjælp', u'Hjælp diskussion',
                 'Kategori', 'Kategori diskussion', 'WikiWegge' ]

darobots = [ 'WeggeBot', 'TwidRobot' ]

class DayStat:
    def __init__(self):
        self.TimeStamp = 0
        self.NewArticlesUser = 0
        self.NewArticlesRobot = 0
        self.NewUsers = 0

class WikiArticle:
    def __init__(self):
        self.Title = ''
        self.Oldest = 0
        self.Newest = 0
        self.isArticleNs = False
        self.isRobotGen = False
        self.isRedirect = False

class WikiUser:
    def __init__(self):
        self.Name = ''
        self.First = 0
        self.Last = 0

class WikiDump(ContentHandler):
    def __init__(self):
        ContentHandler.__init__(self)
        # Parser state
        self.nowIn = []
        self.UserName = []
        self.Title = []
        self.Text = []
        self.TimeStamp = 0
        # Users -> First edit date
        self.Users = {}
        # Page -> First revision, Title, isRedirect
        self.Pages = {}

    def isRedirect(self, text):
        # Is this page a redirect?
        return text.find("#REDIRECT [[") == 0

    def isRealArticle(self, text):
        # Is this page a real article? (Not currently used.)
        if text.find("[[") == 0:
            return False
        return not self.isRedirect(text)

    def isArticleNs(self, title):
        # Is this an article in the main namespace? Titles prefixed
        # with a known namespace are not; a colon alone proves nothing.
        if title.find(':') != -1:
            for ns in danamespaces:
                if title.find(ns + ':') == 0:
                    return False
        return True

    def isRobot(self, contributor):
        # Was this edit made by one of the known robots?
        return contributor in darobots

    def characters(self, ch):
        if not self.nowIn:
            return
        if self.nowIn[-1] == 'title':
            self.Title.append(ch)
        if self.nowIn[-1] == 'timestamp':
            self.TimeStampText.append(ch)
        if self.nowIn[-1] == 'username':
            self.UserName.append(ch)
        if self.nowIn[-1] == 'text':
            self.Text.append(ch)

    def startElement(self, name, attrs):
        self.nowIn.append(name)
        if debug > 9:
            print '>> ' + name
        if name == 'title':
            self.Title = []
        if name == 'timestamp':
            self.TimeStampText = []
        if name == 'contributor':
            # Reset here too, so an anonymous edit (no <username>
            # element) does not inherit the previous username
            self.UserName = []
        if name == 'username':
            self.UserName = []
        if name == 'text':
            self.Text = []

    def endElement(self, name):
        if debug > 9:
            print '<< ' + name
        if self.nowIn.pop() != name:
            print 'Something is rotten!'
            print 'Removing: ' + name + ' from:'
            print self.nowIn
            raise Exception

        if name == 'title':
            self.Title = "".join(self.Title)
            return

        if name == 'revision':
            self.Text = "".join(self.Text)
            # We now have the revision text and timestamp
            if self.Pages.has_key(self.Title):
                wa = self.Pages[self.Title]
                if self.TimeStamp < wa.Oldest:
                    wa.Oldest = self.TimeStamp
                    wa.isRobotGen = self.isRobot(self.UserName)
                if self.TimeStamp > wa.Newest:
                    wa.Newest = self.TimeStamp
                    wa.isRedirect = self.isRedirect(self.Text)
            else:
                wa = WikiArticle()
                wa.isArticleNs = self.isArticleNs(self.Title)
                wa.Title = self.Title
                wa.Oldest = self.TimeStamp
                wa.Newest = self.TimeStamp
                wa.isRobotGen = self.isRobot(self.UserName)
                wa.isRedirect = self.isRedirect(self.Text)
            self.Pages[self.Title] = wa
            return

        if name == 'timestamp':
            self.TimeStampText = "".join(self.TimeStampText)
            tmt = time.strptime(self.TimeStampText, '%Y-%m-%dT%H:%M:%SZ')
            # Truncate to midnight, so revisions are bucketed by day
            self.TimeStamp = calendar.timegm((tmt.tm_year, tmt.tm_mon,
                                              tmt.tm_mday, 0, 0, 0))
            return

        if name == 'contributor':
            # Was it a registered user?
            self.UserName = "".join(self.UserName)
            if self.Users.has_key(self.UserName):
                wu = self.Users[self.UserName]
                if wu.First > self.TimeStamp:
                    wu.First = self.TimeStamp
                if wu.Last < self.TimeStamp:
                    wu.Last = self.TimeStamp
                self.Users[self.UserName] = wu
            elif self.UserName:
                wu = WikiUser()
                wu.Name = self.UserName
                wu.First = self.TimeStamp
                wu.Last = self.TimeStamp
                self.Users[self.UserName] = wu
            return

def main():
    filename = 'dawiki-20060220-pages-meta-history.xml'
#   filename = '/tmp/wikiwegge.xml'

    wd = WikiDump()
    parser = make_parser()
    parser.setContentHandler(wd)
    parser.parse(filename)

    # Bucket new users by the day of their first edit
    hist = {}
    for i in wd.Users.values():
        if hist.has_key(i.First):
            h = hist[i.First]
        else:
            h = DayStat()
            h.TimeStamp = i.First
        h.NewUsers += 1
        hist[i.First] = h

    # Bucket new articles by the day of their first revision
    for i in wd.Pages.values():
        if hist.has_key(i.Oldest):
            h = hist[i.Oldest]
        else:
            h = DayStat()
            h.TimeStamp = i.Oldest
        if i.isRobotGen and not i.isRedirect and i.isArticleNs:
            h.NewArticlesRobot += 1
        elif not i.isRobotGen and not i.isRedirect and i.isArticleNs:
            h.NewArticlesUser += 1
        hist[i.Oldest] = h

    # Sort the day records chronologically; sorting the bare objects
    # would order them arbitrarily
    NewPagesUsersByDate = hist.values()
    NewPagesUsersByDate.sort(key=lambda d: d.TimeStamp)

    #
    # And now ... output
    #
    print '#\n# Stats based on %s\n#\n' % filename
    print '# Timestamp, Number of users, Total pages by user,' \
          + ' Total pages by robots'
    users = 0
    userpages = 0
    botpages = 0
    for i in NewPagesUsersByDate:
        users += i.NewUsers
        botpages += i.NewArticlesRobot
        userpages += i.NewArticlesUser
        print '%d %d %d %d' % (i.TimeStamp, users, userpages, botpages)

if __name__ == "__main__":
    main()