# Notebook setup: imports and display options shared by every cell below.

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format
# To undo the float format: pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision=2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")
# load a dataset
tips = sns.load_dataset("tips")
tips  # display the DataFrame (notebook cell output)
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
.. ... ... ... ... ... ... ...
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2
[244 rows x 7 columns]
# Values of the DataFrame as a NumPy ndarray
tips.values  # or tips.to_numpy()
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
total_bill tip size
count 244.00 244.00 244.00
mean 19.79 3.00 2.57
std 8.90 1.38 0.95
min 3.07 1.00 1.00
25% 13.35 2.00 2.00
50% 17.80 2.90 2.00
75% 24.13 3.56 3.00
max 50.81 10.00 6.00
tips.describe(include="all") # summarize columns of every dtype
total_bill tip sex smoker day time size
count 244.00 244.00 244 244 244 244 244.00
unique NaN NaN 2 2 4 2 NaN
top NaN NaN Male No Sat Dinner NaN
freq NaN NaN 157 151 87 176 NaN
... ... ... ... ... ... ... ...
25% 13.35 2.00 NaN NaN NaN NaN 2.00
50% 17.80 2.90 NaN NaN NaN NaN 2.00
75% 24.13 3.56 NaN NaN NaN NaN 3.00
max 50.81 10.00 NaN NaN NaN NaN 6.00
[11 rows x 7 columns]
# Summarize only the categorical columns (count, unique, top, freq)
tips.describe(include="category")
sex smoker day time
count 244 244 244 244
unique 2 2 4 2
top Male No Sat Dinner
freq 157 151 87 176
# Variations of DataFrame.value_counts on the tips data
s1 = tips.value_counts("day")  # counts per category of the "day" column
s2 = tips.value_counts("day", sort=False)  # default: sort=True
s3 = tips.value_counts("day", ascending=True)  # default: ascending=False
s4 = tips.value_counts("day", normalize=True)  # proportion per category
s5 = tips.value_counts(["sex", "smoker"])  # counts per unique ("sex", "smoker") combination
Tip
.value_counts()의 결과는 Series이며 그 이름은 'count' 또는 'proportion'임 (pandas 2.0)
결측치(NA)는 기본적으로 count하지 않으나 dropna=False를 이용해 나타낼 수 있음
# Count "day" categories; dropna=False also reports missing values (NA)
tips.value_counts("day", dropna=False)
Series에 대해서도 적용되며, DataFrame으로 컬럼을 선택해 적용할 수 있음
# value_counts also works on a Series, or on a DataFrame of selected columns
tips["day"].value_counts()  # tips["day"]: Series object
tips[["sex", "smoker"]].value_counts()
Data: palmerpenguins
# load a dataset
penguins = sns.load_dataset("penguins")
penguins  # display the DataFrame (notebook cell output)
species island bill_length_mm bill_depth_mm flipper_length_mm \
0 Adelie Torgersen 39.1 18.7 181.0
1 Adelie Torgersen 39.5 17.4 186.0
2 Adelie Torgersen 40.3 18.0 195.0
3 Adelie Torgersen NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0
.. ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0
341 Gentoo Biscoe 50.4 15.7 222.0
342 Gentoo Biscoe 45.2 14.8 212.0
343 Gentoo Biscoe 49.9 16.1 213.0
body_mass_g sex
0 3750.0 Male
1 3800.0 Female
2 3250.0 Female
3 NaN NaN
4 3450.0 Female
.. ... ...
339 NaN NaN
340 4850.0 Female
341 5750.0 Male
342 5200.0 Female
343 5400.0 Male
[344 rows x 7 columns]
species island sex
count 344 344 333
unique 3 3 2
top Adelie Biscoe Male
freq 152 168 168
# Counts for each (island, species) combination
penguins.value_counts(["island", "species"])
island species
Biscoe Gentoo 124
Dream Chinstrap 68
Adelie 56
Torgersen Adelie 52
Biscoe Adelie 44
Name: count, dtype: int64
penguins.value_counts(["sex", "species"], dropna=False) # NA is omitted by default; dropna=False keeps it
sex species
Female Adelie 73
Male Adelie 73
Gentoo 61
Female Gentoo 58
Chinstrap 34
Male Chinstrap 34
NaN Adelie 6
Gentoo 5
Name: count, dtype: int64
# Number of NA values in each column
penguins.isna().sum()
species 0
island 0
bill_length_mm 2
bill_depth_mm 2
flipper_length_mm 2
body_mass_g 2
sex 11
dtype: int64
# Proportion of NA values in each column
penguins.isna().mean()
species 0.000000
island 0.000000
bill_length_mm 0.005814
bill_depth_mm 0.005814
flipper_length_mm 0.005814
body_mass_g 0.005814
sex 0.031977
dtype: float64
# Sort rows by "tip", largest first
tips.sort_values("tip", ascending=False)
total_bill tip sex smoker day time size
170 50.81 10.00 Male Yes Sat Dinner 3
212 48.33 9.00 Male No Sat Dinner 4
23 39.42 7.58 Male No Sat Dinner 4
59 48.27 6.73 Male No Sat Dinner 4
.. ... ... ... ... ... ... ...
236 12.60 1.00 Male Yes Sat Dinner 2
111 7.25 1.00 Female No Sat Dinner 1
67 3.07 1.00 Female Yes Sat Dinner 1
92 5.75 1.00 Female Yes Fri Dinner 2
[244 rows x 7 columns]
total_bill tip sex smoker day time size
125 29.80 4.20 Female No Thur Lunch 6
143 27.05 5.00 Female No Thur Lunch 6
156 48.17 5.00 Male No Sun Dinner 6
141 34.30 6.70 Male No Thur Lunch 6
.. ... ... ... ... ... ... ...
67 3.07 1.00 Female Yes Sat Dinner 1
111 7.25 1.00 Female No Sat Dinner 1
82 10.07 1.83 Female No Thur Lunch 1
222 8.58 1.92 Male Yes Fri Lunch 1
[244 rows x 7 columns]
tips.nlargest(3, "tip") # handling of ties at equal rank: keep="first", "last", "all"
total_bill tip sex smoker day time size
170 50.81 10.00 Male Yes Sat Dinner 3
212 48.33 9.00 Male No Sat Dinner 4
23 39.42 7.58 Male No Sat Dinner 4