Inspecting data

Mixed

Author

Sungkyun Cho

Published

March 20, 2024

Load packages
# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format  # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision = 2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

Useful method

.head(), .tail(), .sample()
.info(), .describe(),
.value_counts(),
.sort_values(), .nlargest(), .nsmallest()

Data: Tips
일정기간 한 웨이터가 얻은 팁에 대한 데이터

# load a dataset
tips = sns.load_dataset("tips")
tips
     total_bill  tip     sex smoker   day    time  size
0         16.99 1.01  Female     No   Sun  Dinner     2
1         10.34 1.66    Male     No   Sun  Dinner     3
2         21.01 3.50    Male     No   Sun  Dinner     3
3         23.68 3.31    Male     No   Sun  Dinner     2
..          ...  ...     ...    ...   ...     ...   ...
240       27.18 2.00  Female    Yes   Sat  Dinner     2
241       22.67 2.00    Male    Yes   Sat  Dinner     2
242       17.82 1.75    Male     No   Sat  Dinner     2
243       18.78 3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]
# DataFrame의 값들: ndarray
tips.values # or tips.to_numpy()
array([[16.99, 1.01, 'Female', ..., 'Sun', 'Dinner', 2],
       [10.34, 1.66, 'Male', ..., 'Sun', 'Dinner', 3],
       [21.01, 3.5, 'Male', ..., 'Sun', 'Dinner', 3],
       ...,
       [22.67, 2.0, 'Male', ..., 'Sat', 'Dinner', 2],
       [17.82, 1.75, 'Male', ..., 'Sat', 'Dinner', 2],
       [18.78, 3.0, 'Female', ..., 'Thur', 'Dinner', 2]], dtype=object)
tips.head() # 처음 N개 나열
   total_bill  tip     sex smoker  day    time  size
0       16.99 1.01  Female     No  Sun  Dinner     2
1       10.34 1.66    Male     No  Sun  Dinner     3
2       21.01 3.50    Male     No  Sun  Dinner     3
3       23.68 3.31    Male     No  Sun  Dinner     2
4       24.59 3.61  Female     No  Sun  Dinner     4
tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
tips.describe() # numerical type만 나열
       total_bill    tip   size
count      244.00 244.00 244.00
mean        19.79   3.00   2.57
std          8.90   1.38   0.95
min          3.07   1.00   1.00
25%         13.35   2.00   2.00
50%         17.80   2.90   2.00
75%         24.13   3.56   3.00
max         50.81  10.00   6.00
tips.describe(include="all") # all types 나열
        total_bill    tip   sex smoker  day    time   size
count       244.00 244.00   244    244  244     244 244.00
unique         NaN    NaN     2      2    4       2    NaN
top            NaN    NaN  Male     No  Sat  Dinner    NaN
freq           NaN    NaN   157    151   87     176    NaN
...            ...    ...   ...    ...  ...     ...    ...
25%          13.35   2.00   NaN    NaN  NaN     NaN   2.00
50%          17.80   2.90   NaN    NaN  NaN     NaN   2.00
75%          24.13   3.56   NaN    NaN  NaN     NaN   3.00
max          50.81  10.00   NaN    NaN  NaN     NaN   6.00

[11 rows x 7 columns]
tips.describe(include="category")
         sex smoker  day    time
count    244    244  244     244
unique     2      2    4       2
top     Male     No  Sat  Dinner
freq     157    151   87     176
s1 = tips.value_counts("day") # "day" 칼럼에 대한 각 카테고리별 counts
s2 = tips.value_counts("day", sort=False) # default: sort is true
s3 = tips.value_counts("day", ascending=True) # default: ascending is False
s4 = tips.value_counts("day", normalize=True) # 카테고리별 비율
s5 = tips.value_counts(["sex", "smoker"]) # "sex", "smoker" 칼럼에 대한 유니크한 카테고리별 counts
day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64
(a) s1
day
Thur    62
Fri     19
Sat     87
Sun     76
Name: count, dtype: int64
(b) s2
day
Fri     19
Thur    62
Sun     76
Sat     87
Name: count, dtype: int64
(c) s3
day
Sat    0.36
Sun    0.31
Thur   0.25
Fri    0.08
Name: proportion, dtype: float64
(d) s4
sex     smoker
Male    No        97
        Yes       60
Female  No        54
        Yes       33
Name: count, dtype: int64
(e) s5
Figure 1: value_count()의 arguments
Tip

.value_count()의 결과는 Series이며 그 이름은 ‘count’ 또는 ’proportion’임 (pandas 2.0)

Missing(NA)을 count하지 않으나 dropna=False을 이용해 나타낼 수 있음

tips.value_counts("day", dropna=False)

Series에 대해서도 적용되며, DataFrame으로 컬럼을 선택해 적용할 수 있음

tips["day"].value_counts()  # tips["day"]: Series object
tips[["sex", "smoker"]].value_counts()

Data: palmerpenguins

# load a dataset
penguins = sns.load_dataset("penguins")
penguins
    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0    Adelie  Torgersen            39.1           18.7              181.0   
1    Adelie  Torgersen            39.5           17.4              186.0   
2    Adelie  Torgersen            40.3           18.0              195.0   
3    Adelie  Torgersen             NaN            NaN                NaN   
4    Adelie  Torgersen            36.7           19.3              193.0   
..      ...        ...             ...            ...                ...   
339  Gentoo     Biscoe             NaN            NaN                NaN   
340  Gentoo     Biscoe            46.8           14.3              215.0   
341  Gentoo     Biscoe            50.4           15.7              222.0   
342  Gentoo     Biscoe            45.2           14.8              212.0   
343  Gentoo     Biscoe            49.9           16.1              213.0   

     body_mass_g     sex  
0         3750.0    Male  
1         3800.0  Female  
2         3250.0  Female  
3            NaN     NaN  
4         3450.0  Female  
..           ...     ...  
339          NaN     NaN  
340       4850.0  Female  
341       5750.0    Male  
342       5200.0  Female  
343       5400.0    Male  

[344 rows x 7 columns]
penguins.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB
penguins.describe(include="object")
       species  island   sex
count      344     344   333
unique       3       3     2
top     Adelie  Biscoe  Male
freq       152     168   168
penguins.value_counts(["island", "species"])
island     species  
Biscoe     Gentoo       124
Dream      Chinstrap     68
           Adelie        56
Torgersen  Adelie        52
Biscoe     Adelie        44
Name: count, dtype: int64
penguins.value_counts(["sex", "species"], dropna=False) # NA은 기본적으로 생략
sex     species  
Female  Adelie       73
Male    Adelie       73
        Gentoo       61
Female  Gentoo       58
        Chinstrap    34
Male    Chinstrap    34
NaN     Adelie        6
        Gentoo        5
Name: count, dtype: int64
# NA의 개수
penguins.isna().sum()
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64
# NA의 비율
penguins.isna().mean()
species              0.000000
island               0.000000
bill_length_mm       0.005814
bill_depth_mm        0.005814
flipper_length_mm    0.005814
body_mass_g          0.005814
sex                  0.031977
dtype: float64
tips.sort_values("tip", ascending=False)
     total_bill   tip     sex smoker  day    time  size
170       50.81 10.00    Male    Yes  Sat  Dinner     3
212       48.33  9.00    Male     No  Sat  Dinner     4
23        39.42  7.58    Male     No  Sat  Dinner     4
59        48.27  6.73    Male     No  Sat  Dinner     4
..          ...   ...     ...    ...  ...     ...   ...
236       12.60  1.00    Male    Yes  Sat  Dinner     2
111        7.25  1.00  Female     No  Sat  Dinner     1
67         3.07  1.00  Female    Yes  Sat  Dinner     1
92         5.75  1.00  Female    Yes  Fri  Dinner     2

[244 rows x 7 columns]
tips.sort_values(["size", "tip"], ascending=[False, True])
     total_bill  tip     sex smoker   day    time  size
125       29.80 4.20  Female     No  Thur   Lunch     6
143       27.05 5.00  Female     No  Thur   Lunch     6
156       48.17 5.00    Male     No   Sun  Dinner     6
141       34.30 6.70    Male     No  Thur   Lunch     6
..          ...  ...     ...    ...   ...     ...   ...
67         3.07 1.00  Female    Yes   Sat  Dinner     1
111        7.25 1.00  Female     No   Sat  Dinner     1
82        10.07 1.83  Female     No  Thur   Lunch     1
222        8.58 1.92    Male    Yes   Fri   Lunch     1

[244 rows x 7 columns]
tips.nlargest(3, "tip")  # 다수의 동등 순위가 있을 때 처리: keep="first", "last", "all"
     total_bill   tip   sex smoker  day    time  size
170       50.81 10.00  Male    Yes  Sat  Dinner     3
212       48.33  9.00  Male     No  Sat  Dinner     4
23        39.42  7.58  Male     No  Sat  Dinner     4