Top Wikipedia stories in 2014

@brianckeegan

import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup, element
from urllib.request import urlopen
# Read the HTML from the Wikipedia stats page and parse it into soup
soup = BeautifulSoup(urlopen('http://stats.wikimedia.org/EN/TablesWikipediaEN.htm').read(), 'html.parser')
# Find all the bold tags whose text mentions 2014 (the month headers)
_p = soup.find_all('b', string=re.compile('2014'))
# Keep only those parents with exactly 152 children, corresponding to the top-25 lists
_p2014 = [t.parent for t in _p if len(t.parent) == 152]
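# (152 presumably reflects the month header plus 25 rows of three fields each:
# 76 tags interleaved with 76 whitespace NavigableStrings)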
# Get the text out of the children tags as a list of lists
parsed = [[t.text for t in list(p.children) if type(t) != element.NavigableString] for p in _p2014]
# Convert to a dictionary keyed by month abbreviation with values as the list of text fields
parsed = {month[0].split('\xa0')[0]: month[1:] for month in parsed}
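# parsed is now keyed by month abbreviation, with each value a flat list of
# rank/editors/article strings, e.g. (hypothetical values):
#   {'Jan': ['1', '250', 'Article A', '2', '180', 'Article B', ...], ...}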
# Regroup each month's flat list into dicts of rank, editors, and article using stride-3 slices
parsed = {k: [{'rank': int(a), 'editors': int(b), 'article': c} for a, b, c in zip(v[0::3], v[1::3], v[2::3])] for k, v in parsed.items()}
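# The stride-3 slices pull out every rank, editor count, and article name in
# parallel; a minimal sketch with made-up values:
#   v = ['1', '250', 'Article A', '2', '180', 'Article B']
#   list(zip(v[0::3], v[1::3], v[2::3]))
#   # -> [('1', '250', 'Article A'), ('2', '180', 'Article B')]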
# Convert each month into a DataFrame with the month repeated in the index,
# then concat all the dfs together and sort by the number of editors
ranked = pd.concat([pd.DataFrame(parsed[i], index=[i]*len(parsed[i])) for i in parsed.keys()]).sort_values('editors', ascending=False).reset_index()
# Rename the column produced by reset_index to something meaningful
ranked.rename(columns={'index': 'month'}, inplace=True)
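# ranked now has one row per (month, article) pair, with columns for
# month, rank, editors, and article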
# Group the articles by name and compute aggregate statistics,
# ranking on the number of months spent in the top 25 and the total number of editors
top_articles = ranked.groupby('article').agg({'month': len, 'editors': np.sum, 'rank': np.min}).sort_values(['month', 'editors'], ascending=False)
top_articles
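
To eyeball just the leaders rather than scroll the full aggregate table, a minimal follow-up sketch:

top_articles.head(10)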