The results of the analysis are described in my Data Analysis Blog.
# Analysis of athletes.csv: medal rates, nationalities, multi-medal sports
# and mean ages by sport, each split by sex.
#
# This script is written in Python 2.x; the __future__ import makes `/` true
# division there and is harmless on Python 3.
from __future__ import division

import datetime as dt  # kept from the original script

import pandas as pd

# Reference date, required later to calculate age.
end = pd.to_datetime('20160821').date()

# Mean Gregorian year in days; the same year length NumPy's 'Y' timedelta
# unit is based on, so ages match the original '<m8[Y]' conversion.
MEAN_YEAR_DAYS = 365.2425

# Sports whose mean ages are reported, in print order.
AGE_REPORT_SPORTS = ('aquatics', 'athletics', 'gymnastics')


def correct_age(x):
    """Return ``x + 100`` when ``x`` is negative, otherwise ``x`` unchanged.

    NOTE(review): negative ages presumably come from two-digit birth years
    being parsed into the wrong century -- confirm against athletes.csv.
    """
    if x < 0:
        return x + 100
    return x


def add_medals_and_age(df, reference_date=end):
    """Return a copy of ``df`` with ``medals`` and ``age`` columns added.

    ``medals`` is gold + silver + bronze.  ``age`` is the number of whole
    mean years between ``dob`` and ``reference_date``, passed through
    ``correct_age``.  The input frame is not modified.
    """
    out = df.copy()
    out['medals'] = out['gold'] + out['silver'] + out['bronze']
    # Wrap in Timestamp: subtracting a datetime Series from a plain
    # datetime.date is not reliably supported across pandas versions.
    elapsed = pd.Timestamp(reference_date) - pd.to_datetime(out['dob'])
    # Floor division (not truncation) so that negative spans round down
    # before correct_age adds 100.
    out['age'] = (elapsed.dt.days // MEAN_YEAR_DAYS).apply(correct_age)
    return out


def load_athletes(path='athletes.csv'):
    """Read the CSV at ``path`` and add the derived columns.

    Column index 4 is parsed as a date (the 'dob' column per the original
    script's comment).
    """
    return add_medals_and_age(pd.read_csv(path, parse_dates=[4]))


def split_by_medals(df, sex):
    """Return ``(all, medallists, non_medallists)`` frames for one sex.

    ``medallists`` keeps rows whose ``medals`` is not 0 and
    ``non_medallists`` keeps rows whose ``medals`` equals 0.
    """
    everyone = df[df['sex'] == sex].copy()
    medallists = everyone[everyone['medals'] != 0].copy()
    non_medallists = everyone[everyone['medals'] == 0].copy()
    return everyone, medallists, non_medallists


def percent_winners(winners, total):
    """Return ``winners`` as a percentage of ``total``, to one decimal.

    Returns 0.0 when ``total`` is 0 instead of raising ZeroDivisionError.
    """
    if total == 0:
        return 0.0
    return round((winners / total) * 100, 1)


def only_in(first, second):
    """Return the values found in ``first`` but not in ``second``, sorted.

    Sorting uses ``str`` as the key so the output order is stable even if
    a missing value (NaN) sits among the strings.
    """
    return sorted(set(first) - set(second), key=str)


def multi_medal_sports(df, minimum=3):
    """Count athletes with at least ``minimum`` medals per sport.

    Returns a Series indexed by sport, largest count first.
    """
    top = df[df['medals'] >= minimum]
    return top.groupby('sport').size().sort_values(ascending=False)


def mean_age_by_sport(df):
    """Return the mean ``age`` per ``sport`` as a Series."""
    return df['age'].groupby(df['sport']).mean()


def main(path='athletes.csv'):
    """Run the whole analysis and print results in the original order."""
    df = load_athletes(path)

    # create new dataframes for male and female and for athletes who won
    # at least one medal (nz) or none (z)
    df_m, df_m_nz, df_m_z = split_by_medals(df, 'male')
    df_f, df_f_nz, df_f_z = split_by_medals(df, 'female')

    # what percentage of male / female athletes won at least one medal?
    total_male_athletes = len(df_m.index)
    num_male_medal = len(df_m_nz.index)
    total_female_athletes = len(df_f.index)
    num_female_medal = len(df_f_nz.index)

    print(total_male_athletes)
    print(num_male_medal)
    print(percent_winners(num_male_medal, total_male_athletes))
    print('')
    print(total_female_athletes)
    print(num_female_medal)
    print(percent_winners(num_female_medal, total_female_athletes))

    ################################################
    # get the nationalities of all male and all female athletes
    male_nationalities_list = df_m.nationality.unique()
    female_nationalities_list = df_f.nationality.unique()
    # how many nationalities?
    print(len(male_nationalities_list))
    print(len(female_nationalities_list))
    # what countries sent all-male teams
    print(only_in(male_nationalities_list, female_nationalities_list))
    # what countries sent all-female teams
    print(only_in(female_nationalities_list, male_nationalities_list))

    ################################################
    # what sports give a male, then a female, athlete the best chance of
    # winning multiple medals?
    print(multi_medal_sports(df_m))
    print(multi_medal_sports(df_f))

    ##################################################
    # is there a difference in the mean age of athletes who won medals and
    # those who did not?  Printed per sport as: male without medals, male
    # with medals, female without medals, female with medals.
    age_means = [
        mean_age_by_sport(df_m_z),
        mean_age_by_sport(df_m_nz),
        mean_age_by_sport(df_f_z),
        mean_age_by_sport(df_f_nz),
    ]
    for sport in AGE_REPORT_SPORTS:
        print(sport)
        for means in age_means:
            # .get avoids a KeyError when a group has no rows for a sport.
            print(means.get(sport, float('nan')))


if __name__ == '__main__':
    main()
0 Comments
Leave a Reply. 
This blog includes: scripts, mainly in Python with a few in R, covering NLP, Pandas, Matplotlib and others. See the home page for links to some of the scripts. It also includes some explanations of basic data science terminology. Archives
October 2017
