In [1]:
import pandas as pd
In [8]:
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
df = pd.read_csv('/Users/shivampandey/Downloads/python files/datasets/sales.csv')
df.head()
Out[8]:
month eggs salt spam
0 Jan 47 12.0 17
1 Feb 110 50.0 31
2 Mar 221 89.0 72
3 Apr 77 87.0 20
4 May 132 NaN 52

Filtering and slicing

In [13]:
df[df['eggs']>132]
Out[13]:
month eggs salt spam
2 Mar 221 89.0 72
5 Jun 205 60.0 55
In [15]:
df[(df['eggs']>110) & (df['spam']>31)]
Out[15]:
month eggs salt spam
2 Mar 221 89.0 72
4 May 132 NaN 52
5 Jun 205 60.0 55
In [61]:
df = pd.read_csv('/Users/shivampandey/Downloads/python files/datasets/titanic.csv')
df.head()
Out[61]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.00 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.92 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON
2 1 0 Allison, Miss. Helen Loraine female 2.00 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
3 1 0 Allison, Mr. Hudson Joshua Creighton male 30.00 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.00 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
In [62]:
# Drop rows containing ANY NaN — every row here has at least one, so the result is (0, 14).
print(df.dropna(how = 'any').shape)
(0, 14)
In [63]:
df.loc[:,df.any()].shape#keeps columns with at least one truthy (non-zero) value; NOT a missing-value check — NaN is truthy here, so all 14 columns survive
Out[63]:
(1309, 14)
In [64]:
df.loc[:,df.all()].shape#keeps columns where every value is truthy; this drops columns containing 0 (e.g. survived/sibsp/parch), not columns with NaN — hence 10, not the 7 that notnull().all() gives
Out[64]:
(1309, 10)
In [65]:
#to know NaN values in the dataset we can use isnull() and notnull()
df.loc[:,df.isnull().any()].shape#this will give all the columns having any nan values...
Out[65]:
(1309, 7)
In [66]:
df.loc[:,df.notnull().all()].shape#this will return all the columns having not nan values.
Out[66]:
(1309, 7)
In [67]:
df.isnull().any()#boolean to know where is null value.
Out[67]:
pclass       False
survived     False
name         False
sex          False
age           True
sibsp        False
parch        False
ticket       False
fare          True
cabin         True
embarked      True
boat          True
body          True
home.dest     True
dtype: bool
In [68]:
df.notnull().all()
Out[68]:
pclass        True
survived      True
name          True
sex           True
age          False
sibsp         True
parch         True
ticket        True
fare         False
cabin        False
embarked     False
boat         False
body         False
home.dest    False
dtype: bool
In [69]:
# thresh=10 keeps each column that has at least 10 non-null values — every column
# here qualifies (even 'body' has 121), so all 14 columns remain.
print(df.dropna(thresh = 10,axis = 'columns').info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
name         1309 non-null object
sex          1309 non-null object
age          1046 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
dtypes: float64(3), int64(4), object(7)
memory usage: 143.2+ KB
None
In [70]:
# Encode 'sex' numerically in a new 'gender' column: male -> 1, female -> 0.
male_female = {'male':1,'female':0}
df['gender'] = df.sex.map(male_female)
df.head()
Out[70]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest gender
0 1 1 Allen, Miss. Elisabeth Walton female 29.00 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO 0
1 1 1 Allison, Master. Hudson Trevor male 0.92 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON 1
2 1 0 Allison, Miss. Helen Loraine female 2.00 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON 0
3 1 0 Allison, Mr. Hudson Joshua Creighton male 30.00 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON 1
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.00 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON 0
In [71]:
# the z-score is the number of standard deviations by which an observation is above the mean - so if it is negative, 
#it means the observation is below the mean.
In [78]:
# Mean-impute the 263 missing ages. NOTE(review): this mutates df['age'] in
# place, so the cell is not idempotent with respect to the raw data; the
# 'age_score' column already visible in .info() below suggests out-of-order
# execution (In[78] after In[79] was run).
mean = df['age'].mean()
df['age'] = df['age'].fillna(mean)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 16 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
name         1309 non-null object
sex          1309 non-null object
age          1309 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1308 non-null float64
cabin        295 non-null object
embarked     1307 non-null object
boat         486 non-null object
body         121 non-null float64
home.dest    745 non-null object
gender       1309 non-null int64
age_score    0 non-null float64
dtypes: float64(4), int64(5), object(7)
memory usage: 163.7+ KB
In [79]:
from scipy.stats import zscore 
# z-score = (age - mean(age)) / std(age); a negative score means the passenger
# is younger than the mean age.
score = zscore(df['age'])
df['age_score'] = score
print(df.head())
   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

     age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.00      0      0   24160  211.3375       B5        S    2    NaN   
1   0.92      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.00      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  gender  age_score  
0                     St Louis, MO       0  -0.068420  
1  Montreal, PQ / Chesterville, ON       1  -2.248837  
2  Montreal, PQ / Chesterville, ON       0  -2.164975  
3  Montreal, PQ / Chesterville, ON       1   0.009230  
4  Montreal, PQ / Chesterville, ON       0  -0.379021  
In [9]:
import pandas as pd
df = pd.read_csv('/Users/shivampandey/Downloads/python files/datasets/sales.csv')
df
Out[9]:
month eggs salt spam day
0 Jan 47 12.0 17 1
1 Feb 110 50.0 31 1
2 Mar 221 89.0 72 1
3 Apr 77 87.0 20 2
4 May 132 NaN 52 2
5 Jun 205 60.0 55 2
In [14]:
df1 = df.set_index(['day','month'])
In [15]:
print(df1.loc[[1,2]])#this is outer slicing of index.
           eggs  salt  spam
day month                  
1   Jan      47  12.0    17
    Feb     110  50.0    31
    Mar     221  89.0    72
2   Apr      77  87.0    20
    May     132   NaN    52
    Jun     205  60.0    55
In [17]:
print(df1.loc[(1,'Jan')])# indexing individual rows..
eggs    47.0
salt    12.0
spam    17.0
Name: (1, Jan), dtype: float64
In [50]:
df1.loc[(2,['Apr','May']),'eggs']# fancy indexing(innermost)
Out[50]:
day  month
2    Apr       77
     May      132
Name: eggs, dtype: int64
In [ ]:
df1.loc[(slice(None),slice('Jan','Mar')),:] # slice(None) selects every 'day'; slice('Jan','Mar') slices the inner 'month' level (plain ':' syntax is not allowed inside a tuple)
In [ ]:
# NOTE(review): 'sales' is not defined anywhere in this notebook (pasted from
# an exercise); this cell only runs after a 'sales' DataFrame is created.
# Create the list of new indexes: new_idx
new_idx = [i.upper() for i in sales.index]

# Assign new_idx to sales.index
sales.index = new_idx

# Print the sales DataFrame
print(sales)
In [ ]:
# Assign the string 'MONTHS' to sales.index.name
sales.index.name = 'MONTHS'

# Print the sales DataFrame
print(sales)

# Assign the string 'PRODUCTS' to sales.columns.name 
sales.columns.name = 'PRODUCTS'

# Print the sales dataframe again
print(sales)
In [58]:
user = pd.read_csv('/Users/shivampandey/Downloads/python files/datasets/users.csv',index_col = 0)
user
Out[58]:
weekday city visitors signups
0 Sun Austin 139 7
1 Sun Dallas 237 12
2 Mon Austin 326 3
3 Mon Dallas 456 5
In [60]:
new = user.set_index(['weekday','city'])
new.head()
Out[60]:
visitors signups
weekday city
Sun Austin 139 7
Dallas 237 12
Mon Austin 326 3
Dallas 456 5
In [61]:
unstack = new.unstack(level = 'city')#unstack pivots the 'city' index level out into columns; unlike pivot(), it works on a MultiIndex
unstack
Out[61]:
visitors signups
city Austin Dallas Austin Dallas
weekday
Mon 326 456 3 5
Sun 139 237 7 12
In [63]:
stack = unstack.stack(level = 'city')# stack and unstack are opposite of each other..
stack.head()
Out[63]:
visitors signups
weekday city
Mon Austin 326 3
Dallas 456 5
Sun Austin 139 7
Dallas 237 12
In [65]:
swap = stack.swaplevel(0,1).sort_index()#swaplevel reorders the index levels (city before weekday); sort_index re-sorts so label lookups stay efficient
swap
Out[65]:
visitors signups
city weekday
Austin Mon 326 3
Sun 139 7
Dallas Mon 456 5
Sun 237 12

Grouping by multiple columns and filtering

In [66]:
titanic = pd.read_csv('/Users/shivampandey/Downloads/python files/datasets/titanic.csv')
titanic.head()
Out[66]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.00 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.92 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON
2 1 0 Allison, Miss. Helen Loraine female 2.00 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
3 1 0 Allison, Mr. Hudson Joshua Creighton male 30.00 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.00 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
In [68]:
by_class = titanic.groupby('pclass').count()
In [70]:
print(by_class['survived'])
pclass
1    323
2    277
3    709
Name: survived, dtype: int64
In [71]:
two_group = titanic.groupby(['pclass','embarked']).count()
print(two_group['survived'])
pclass  embarked
1       C           141
        Q             3
        S           177
2       C            28
        Q             7
        S           242
3       C           101
        Q           113
        S           495
Name: survived, dtype: int64
# We can also group one DataFrame by a Series taken from another DataFrame:
#   life = pd.read_csv(life_fname, index_col='Country')
#   regions = pd.read_csv(regions_fname, index_col='Country')
#   life_by_region = life.groupby(regions['region'])
#   print(life_by_region['2010'].mean())
In [75]:
agg = titanic.groupby('pclass')[['age','fare']].agg(['max','min'])
In [77]:
print(agg)
         age            fare     
         max   min       max  min
pclass                           
1       80.0  0.92  512.3292  0.0
2       70.0  0.67   73.5000  0.0
3       74.0  0.17   69.5500  0.0
In [81]:
gap = pd.read_csv('/Users/shivampandey/Downloads/python files/datasets/gapminder_tidy.csv',index_col = ['Year','region'
                                                                                                       ,'Country'])
gap.head()
Out[81]:
fertility life population child_mortality gdp
Year region Country
1964 South Asia Afghanistan 7.671 33.639 10474903.0 339.7 1182.0
1965 South Asia Afghanistan 7.671 34.152 10697983.0 334.1 1182.0
1966 South Asia Afghanistan 7.671 34.662 10927724.0 328.7 1168.0
1967 South Asia Afghanistan 7.671 35.170 11163656.0 323.3 1173.0
1968 South Asia Afghanistan 7.671 35.674 11411022.0 318.1 1187.0
In [84]:
df = gap.groupby(['Year','region'])

# Define the function to compute spread: spread
def spread(series):
    """Return the range of a Series: its largest value minus its smallest."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

aggregate = {'population':'sum','child_mortality':'mean','gdp':spread}

total = df.agg(aggregate)
total.head()
Out[84]:
gdp child_mortality population
Year region
1964 America 18314.0 113.950667 4.621957e+08
East Asia & Pacific 66821.0 129.109130 1.110668e+09
Europe & Central Asia 28734.0 61.585319 6.988545e+08
Middle East & North Africa 38474.0 179.605263 1.180955e+08
South Asia 812.0 256.922500 6.250739e+08
In [85]:
#Filling missing data (imputation) by group

# Create a groupby object: by_sex_class
by_sex_class = titanic.groupby(['sex','pclass'])

#.groupby() and .transform() to fill missing data appropriately for each group
# Write a function that imputes median
#'age' values for passengers on the Titanic with the median age from their 'gender' and 'pclass'. To do this, 
#you'll group by the 'sex' and 'pclass' columns and transform each group with a custom function to call .fillna() 
#and impute the median value

def impute_median(series):
    """Fill missing values in `series` with that series' own median."""
    group_median = series.median()
    return series.fillna(group_median)
In [87]:
titanic.tail()
Out[87]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
1304 3 0 Zabour, Miss. Hileni female 14.5 1 0 2665 14.4542 NaN C NaN 328.0 NaN
1305 3 0 Zabour, Miss. Thamine female 22.0 1 0 2665 14.4542 NaN C NaN NaN NaN
1306 3 0 Zakarian, Mr. Mapriededer male 26.5 0 0 2656 7.2250 NaN C NaN 304.0 NaN
1307 3 0 Zakarian, Mr. Ortin male 27.0 0 0 2670 7.2250 NaN C NaN NaN NaN
1308 3 0 Zimmerman, Mr. Leo male 29.0 0 0 315082 7.8750 NaN S NaN NaN NaN
In [88]:
# Impute age and assign to titanic['age']
titanic['age'] = by_sex_class['age'].transform(impute_median)

# Print the output of titanic.tail(10)
print(titanic.tail(10))
      pclass  survived                                     name     sex   age  \
1299       3         0                      Yasbeck, Mr. Antoni    male  27.0   
1300       3         1  Yasbeck, Mrs. Antoni (Selini Alexander)  female  15.0   
1301       3         0                     Youseff, Mr. Gerious    male  45.5   
1302       3         0                        Yousif, Mr. Wazli    male  25.0   
1303       3         0                    Yousseff, Mr. Gerious    male  25.0   
1304       3         0                     Zabour, Miss. Hileni  female  14.5   
1305       3         0                    Zabour, Miss. Thamine  female  22.0   
1306       3         0                Zakarian, Mr. Mapriededer    male  26.5   
1307       3         0                      Zakarian, Mr. Ortin    male  27.0   
1308       3         0                       Zimmerman, Mr. Leo    male  29.0   

      sibsp  parch  ticket     fare cabin embarked boat   body home.dest  
1299      1      0    2659  14.4542   NaN        C    C    NaN       NaN  
1300      1      0    2659  14.4542   NaN        C  NaN    NaN       NaN  
1301      0      0    2628   7.2250   NaN        C  NaN  312.0       NaN  
1302      0      0    2647   7.2250   NaN        C  NaN    NaN       NaN  
1303      0      0    2627  14.4583   NaN        C  NaN    NaN       NaN  
1304      1      0    2665  14.4542   NaN        C  NaN  328.0       NaN  
1305      1      0    2665  14.4542   NaN        C  NaN    NaN       NaN  
1306      0      0    2656   7.2250   NaN        C  NaN  304.0       NaN  
1307      0      0    2670   7.2250   NaN        C  NaN    NaN       NaN  
1308      0      0  315082   7.8750   NaN        S  NaN    NaN       NaN  
In [90]:
# Grouping and filtering with .apply()
#before aggregating we have to filter first.
gender= titanic.groupby('sex')
def c_deck_survival(gr):
    """Mean 'survived' rate among rows of `gr` whose cabin starts with 'C'.

    Rows with a missing cabin are treated as not on deck C (fillna(False)).
    """
    on_c_deck = gr['cabin'].str.startswith('C').fillna(False)
    return gr.loc[on_c_deck, 'survived'].mean()
In [91]:
survive = gender.apply(c_deck_survival)
print(survive)
sex
female    0.913043
male      0.312500
dtype: float64
In [92]:
#filtering groups with .filter(): keep all rows belonging to groups whose total 'fare' exceeds 20
# NOTE(review): rebinds 'df', shadowing the sales DataFrame loaded earlier.
gender= titanic.groupby('sex')
df = gender.filter(lambda x:x['fare'].sum()>20)
In [94]:
df.head()
Out[94]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.00 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.92 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON
2 1 0 Allison, Miss. Helen Loraine female 2.00 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
3 1 0 Allison, Mr. Hudson Joshua Creighton male 30.00 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.00 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
In [96]:
# Filtering and grouping with .map()
#the Boolean Series: under10
mapped = (titanic['age']<10).map({True:'under10',False:'over10'})
mapped.head()
Out[96]:
0     over10
1    under10
2    under10
3     over10
4     over10
Name: age, dtype: object
In [101]:
grp_by = titanic.groupby(mapped)[['survived','pclass']].mean()
print(grp_by)
         survived    pclass
age                        
over10   0.366748  2.272209
under10  0.609756  2.634146
In [111]:
summer_Olympic = pd.read_csv('/Users/shivampandey/Downloads/python files/datasets/Summer Olympic medallists 1896 to 2008 - ALL MEDALISTS.csv')
print(summer_Olympic.head())
print(summer_Olympic.shape)
     City  Edition     Sport Discipline             Athlete  NOC Gender  \
0  Athens     1896  Aquatics   Swimming       HAJOS, Alfred  HUN    Men   
1  Athens     1896  Aquatics   Swimming    HERSCHMANN, Otto  AUT    Men   
2  Athens     1896  Aquatics   Swimming   DRIVAS, Dimitrios  GRE    Men   
3  Athens     1896  Aquatics   Swimming  MALOKINIS, Ioannis  GRE    Men   
4  Athens     1896  Aquatics   Swimming  CHASAPIS, Spiridon  GRE    Men   

                        Event Event_gender   Medal  
0              100m freestyle            M    Gold  
1              100m freestyle            M  Silver  
2  100m freestyle for sailors            M  Bronze  
3  100m freestyle for sailors            M    Gold  
4  100m freestyle for sailors            M  Silver  
(29216, 10)
In [113]:
slic = summer_Olympic.loc[summer_Olympic.NOC == 'GRE'].groupby('Edition')
In [116]:
print(slic)
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x11170aed0>
In [127]:
medal = summer_Olympic.Medal.value_counts()
medal
Out[127]:
Gold      9850
Bronze    9689
Silver    9677
Name: Medal, dtype: int64
In [129]:
#i want to know how many medals are won by each athlete in NOC.
#for this i have to use pivot_table
#Noc is fixed,medal is going to pivot and athlete will fill the values corresponding to medals,and aggfunc will 
#treat the repeating values as count...
medals = summer_Olympic.pivot_table(index = 'NOC',columns = 'Medal',values = 'Athlete',aggfunc = 'count')
medals.head()
Out[129]:
Medal Bronze Gold Silver
NOC
AFG 1.0 NaN NaN
AHO NaN NaN 1.0
ALG 8.0 4.0 2.0
ANZ 5.0 20.0 4.0
ARG 88.0 68.0 83.0
In [130]:
medals['total'] = medals.sum(axis = 'columns')
In [137]:
print(medals.sort_values('total',ascending = False).head())
Medal  Bronze    Gold  Silver   total
NOC                                  
USA    1052.0  2088.0  1195.0  4335.0
URS     584.0   838.0   627.0  2049.0
GBR     505.0   498.0   591.0  1594.0
FRA     475.0   378.0   461.0  1314.0
ITA     374.0   460.0   394.0  1228.0
In [143]:
new = (summer_Olympic['Event_gender']=='W') & (summer_Olympic['Gender'] == 'Men')
print(summer_Olympic[new])
         City  Edition      Sport Discipline            Athlete  NOC Gender  \
23675  Sydney     2000  Athletics  Athletics  CHEPCHUMBA, Joyce  KEN    Men   

          Event Event_gender   Medal  
23675  marathon            W  Bronze  
In [149]:
#aggregate the number of distinct sports in which the USA and the USR won medals.
#rows for which 'NOC' is either 'USA' or 'URS'
filtr = summer_Olympic.NOC.isin(['USA','URS'])

booln = (summer_Olympic.Edition >=1952) & (summer_Olympic.Edition <= 1988)
# to create the DataFrame:i am using booln and filtr
df1 = summer_Olympic.loc[booln & filtr]
df1.head()
Out[149]:
City Edition Sport Discipline Athlete NOC Gender Event Event_gender Medal
8019 Helsinki 1952 Aquatics Diving LEE, Samuel USA Men 10m platform M Gold
8021 Helsinki 1952 Aquatics Diving STOVER-IRWIN, Juno Roslays USA Women 10m platform W Bronze
8022 Helsinki 1952 Aquatics Diving MCCORMICK, Patricia USA Women 10m platform W Gold
8023 Helsinki 1952 Aquatics Diving MYERS-POPE, Paula Jean USA Women 10m platform W Silver
8024 Helsinki 1952 Aquatics Diving CLOTWORTHY, Robert Lynn USA Men 3m springboard M Bronze
In [155]:
grp = df1.groupby('NOC')
new1 = grp['Sport'].nunique().sort_values(ascending = False)
new1
Out[155]:
NOC
URS    21
USA    20
Name: Sport, dtype: int64
In [158]:
# Which country, the USA or the USSR, won the most medals at each Games?
# Medal count per (Edition, NOC): rows are Games editions, columns are countries.
new_table = summer_Olympic.pivot_table(index = 'Edition',columns = 'NOC',values = 'Athlete',aggfunc = 'count')
# 'Edition' is an integer index (it was compared with >= 1952 above), so slice
# with ints: string labels like '1952' raise a TypeError on an integer index.
accessors = new_table.loc[1952:1988,['USA','URS']]
# idxmax along the columns gives, for each Games, the country with the larger count.
access = accessors.idxmax(axis = 'columns')
print(access)
Edition
1952    USA
1956    URS
1960    URS
1964    URS
1968    URS
1972    URS
1976    URS
1980    URS
1984    USA
1988    URS
dtype: object
In [159]:
#Visualizing USA Medal Counts by Edition: Line Plot
usa = summer_Olympic[summer_Olympic.NOC=='USA']
usa.head()
Out[159]:
City Edition Sport Discipline Athlete NOC Gender Event Event_gender Medal
11 Athens 1896 Athletics Athletics LANE, Francis USA Men 100m M Bronze
13 Athens 1896 Athletics Athletics BURKE, Thomas USA Men 100m M Gold
15 Athens 1896 Athletics Athletics CURTIS, Thomas USA Men 110m hurdles M Gold
19 Athens 1896 Athletics Athletics BLAKE, Arthur USA Men 1500m M Silver
21 Athens 1896 Athletics Athletics BURKE, Thomas USA Men 400m M Gold
In [164]:
grp = usa.groupby(['Edition','Medal'])['Athlete'].agg('count')
grp.head()
#with this its not able to plot any graph..so we have to unstack this.
Out[164]:
Edition  Medal 
1896     Bronze     2
         Gold      11
         Silver     7
1900     Bronze    14
         Gold      27
Name: Athlete, dtype: int64
In [165]:
medls = grp.unstack(level = 'Medal')
medls.head()
Out[165]:
Medal Bronze Gold Silver
Edition
1896 2 11 7
1900 14 27 14
1904 111 146 137
1908 15 34 14
1912 31 45 25
In [167]:
import matplotlib.pyplot as plt
medls.plot()
plt.show()
In [168]:
medls.plot.area()
plt.show()
In [ ]:
# Medals sort lexicographically by default: Bronze < Gold < Silver.
# An ordered Categorical imposes the meaningful order Bronze < Silver < Gold.
# NOTE(review): at this point `medals` is the NOC pivot table (Bronze/Gold/
# Silver/total columns) and has no 'Medal' column, so this line would raise
# AttributeError — it was pasted from an exercise where `medals` is the raw
# medallists DataFrame. Run it against that DataFrame instead.
medals.Medal = pd.Categorical(values = medals.Medal,categories=['Bronze', 'Silver', 'Gold'],ordered=True)