# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format  # show floats with 2 decimals
# to restore the default float display: pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision=2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")
# Load a dataset
tips = sns.load_dataset("tips")
tips  # last expression of the cell: displays the DataFrame
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
.. ... ... ... ... ... ... ...
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2
[244 rows x 7 columns]
pd.crosstab()
두 카테고리 변수의 모든 level 쌍에 대한 count.
normalize: 비율을 계산
margins: 행과 열의 합계(All)를 추가
groupby()를 적용해서도 구할 수 있으나, pd.crosstab()이 좀 더 간결.
# Count the observations for every (day, time) pair:
# days become the rows, meal times become the columns.
pd.crosstab(index=tips["day"], columns=tips["time"])
time Lunch Dinner
day
Thur 61 1
Fri 7 12
Sat 0 87
Sun 0 76
# Same counts using groupby
# observed: how categorical grouping keys are handled. observed=False keeps
# category combinations that never occur (reported with a count of 0).
# The default is False in pandas 2.x and changes to True in pandas 3.0.
tips.groupby(["day", "time"], observed=False).size()
day time
Thur Lunch 61
Dinner 1
Fri Lunch 7
..
Sat Dinner 87
Sun Lunch 0
Dinner 76
Length: 8, dtype: int64