Cross-Tabulation and Pivot Tables

R for Data Science by Wickham & Grolemund

Author

Sungkyun Cho

Published

May 1, 2024

Load packages
# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format  # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision = 2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")
# Load a datdset
tips = sns.load_dataset("tips")
tips
     total_bill  tip     sex smoker   day    time  size
0         16.99 1.01  Female     No   Sun  Dinner     2
1         10.34 1.66    Male     No   Sun  Dinner     3
2         21.01 3.50    Male     No   Sun  Dinner     3
..          ...  ...     ...    ...   ...     ...   ...
241       22.67 2.00    Male    Yes   Sat  Dinner     2
242       17.82 1.75    Male     No   Sat  Dinner     2
243       18.78 3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]

pd.crosstab()

두 카테고리 변수의 모든 level 쌍에 대한 count.

  • normalize: 비율을 계산
  • margins: 행과 열로 합

groupby()를 적용해 구할 수 있으나 좀 더 간결.

pd.crosstab(tips["day"], tips["time"])
time  Lunch  Dinner
day                
Thur     61       1
Fri       7      12
Sat       0      87
Sun       0      76
# groupby를 이용
tips.groupby(["day", "time"], observed=False).size()  # observed: categorical type의 변수에 대한 처리 (default: True)
day   time  
Thur  Lunch     61
      Dinner     1
Fri   Lunch      7
                ..
Sat   Dinner    87
Sun   Lunch      0
      Dinner    76
Length: 8, dtype: int64
# groupby & unstack를 이용
tips.groupby(["day", "time"], observed=False).size().unstack()
time  Lunch  Dinner
day                
Thur     61       1
Fri       7      12
Sat       0      87
Sun       0      76

Parameters

  • normalize: 비율을 계산 (index, columns, all)
  • margins: 행과 열로 합산
pd.crosstab(tips["day"], tips["time"], normalize='all', margins=True)
time  Lunch  Dinner  All
day                     
Thur   0.90    0.01 0.25
Fri    0.10    0.07 0.08
Sat    0.00    0.49 0.36
Sun    0.00    0.43 0.31
pd.crosstab(tips["day"], tips["time"], normalize='columns')
time  Lunch  Dinner
day                
Thur   0.90    0.01
Fri    0.10    0.07
Sat    0.00    0.49
Sun    0.00    0.43

pivot_table()

  • count 대신 mean
  • mean 외에 다른 aggregation 함수를 지정할 수 있음
  • grouping을 할 변수들을 (index and/or columns)과 aggregate할 변수 지정 (values)
# grouping을 할 변수들을 index에 지정, aggregate할 변수 지정
tips.pivot_table(index=["day", "time"], values="tip")  # dropna=True
             tip
day  time       
Thur Lunch  2.77
     Dinner 3.00
Fri  Lunch  2.38
     Dinner 2.94
Sat  Dinner 2.99
Sun  Dinner 3.26
# grouping을 할 변수들을 index & columns에 지정, aggregate할 변수 지정
tips.pivot_table(index="day", columns="time", values="tip")
time  Lunch  Dinner
day                
Thur   2.77    3.00
Fri    2.38    2.94
Sat     NaN    2.99
Sun     NaN    3.26
# groupby를 이용
tips.groupby(["day", "time"], observed=True)["tip"].mean()
day   time  
Thur  Lunch    2.77
      Dinner   3.00
Fri   Lunch    2.38
      Dinner   2.94
Sat   Dinner   2.99
Sun   Dinner   3.26
Name: tip, dtype: float64
# groupby를 이용
tips.groupby(["day", "time"], observed=True)["tip"].mean().unstack()
time  Lunch  Dinner
day                
Thur   2.77    3.00
Fri    2.38    2.94
Sat     NaN    2.99
Sun     NaN    3.26
# 두 개 이상의 변수에 대한 aggregation
tips.pivot_table(index="day", columns="time", values=["tip", "total_bill"])
       tip        total_bill       
time Lunch Dinner      Lunch Dinner
day                                
Thur  2.77   3.00      17.66  18.78
Fri   2.38   2.94      12.85  19.66
Sat    NaN   2.99        NaN  20.44
Sun    NaN   3.26        NaN  21.41
# groupby를 이용
tips.groupby(["day", "time"])[["tip", "total_bill"]].mean().unstack()  # observed=True
       tip        total_bill       
time Lunch Dinner      Lunch Dinner
day                                
Thur  2.77   3.00      17.66  18.78
Fri   2.38   2.94      12.85  19.66
Sat    NaN   2.99        NaN  20.44
Sun    NaN   3.26        NaN  21.41
# margins 추가
tips.pivot_table(index="day", columns="time", values="tip", margins=True)
time  Lunch  Dinner  All
day                     
Thur   2.77    3.00 2.77
Fri    2.38    2.94 2.73
Sat     NaN    2.99 2.99
Sun     NaN    3.26 3.26
All    2.73    3.10 3.00