Transforming I

R for Data Science by Wickham & Grolemund

Author

Sungkyun Cho

Published

October 31, 2024

Load packages
# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format  # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision = 2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

이번 장에서는 시각화를 하기 전후로, 중요한 데이터 패턴을 보기 위해서 새로운 변수를 만들거나 요약한 통계치를 만들 필요가 있는데 이를 다루는 핵심적인 함수들에 대해 익힙니다.

대략 다음과 같은 transform들을 조합하여 분석에 필요한 상태로 바꿉니다.


On-time data for all flights that departed NYC (i.e. JFK, LGA or EWR) in 2013

# import the dataset
flights_data = sm.datasets.get_rdataset("flights", "nycflights13")
flights = flights_data.data
flights = flights.drop(columns="time_hour")  # drop the "time_hour" column
# Description
print(flights_data.__doc__)
flights
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
0 2013 1 1 517.00 515 2.00 830.00 819 11.00 UA 1545 N14228 EWR IAH 227.00 1400 5 15
1 2013 1 1 533.00 529 4.00 850.00 830 20.00 UA 1714 N24211 LGA IAH 227.00 1416 5 29
2 2013 1 1 542.00 540 2.00 923.00 850 33.00 AA 1141 N619AA JFK MIA 160.00 1089 5 40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
336773 2013 9 30 NaN 1210 NaN NaN 1330 NaN MQ 3461 N535MQ LGA BNA NaN 764 12 10
336774 2013 9 30 NaN 1159 NaN NaN 1344 NaN MQ 3572 N511MQ LGA CLE NaN 419 11 59
336775 2013 9 30 NaN 840 NaN NaN 1020 NaN MQ 3531 N839MQ LGA RDU NaN 431 8 40

336776 rows × 18 columns

flights.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   year            336776 non-null  int64  
 1   month           336776 non-null  int64  
 2   day             336776 non-null  int64  
 3   dep_time        328521 non-null  float64
 4   sched_dep_time  336776 non-null  int64  
 5   dep_delay       328521 non-null  float64
 6   arr_time        328063 non-null  float64
 7   sched_arr_time  336776 non-null  int64  
 8   arr_delay       327346 non-null  float64
 9   carrier         336776 non-null  object 
 10  flight          336776 non-null  int64  
 11  tailnum         334264 non-null  object 
 12  origin          336776 non-null  object 
 13  dest            336776 non-null  object 
 14  air_time        327346 non-null  float64
 15  distance        336776 non-null  int64  
 16  hour            336776 non-null  int64  
 17  minute          336776 non-null  int64  
dtypes: float64(5), int64(9), object(4)
memory usage: 46.2+ MB

Rows

query()

The query() Method 참조

Conditional operators
>,  >=,  <,  <=,
==  (equal to),  !=  (not equal to)
and, & (and)
or, | (or)
not, ~ (not)
in (includes), not in (not included)

# Flights that arrived more than 120 minutes (two hours) late
flights.query('arr_delay > 120')
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
119 2013 1 1 811.00 630 101.00 1047.00 830 137.00 MQ 4576 N531MQ LGA CLT 118.00 544 6 30
151 2013 1 1 848.00 1835 853.00 1001.00 1950 851.00 MQ 3944 N942MQ JFK BWI 41.00 184 18 35
218 2013 1 1 957.00 733 144.00 1056.00 853 123.00 UA 856 N534UA EWR BOS 37.00 200 7 33
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
336724 2013 9 30 2053.00 1815 158.00 2310.00 2054 136.00 EV 5292 N600QX EWR ATL 91.00 746 18 15
336757 2013 9 30 2159.00 1845 194.00 2344.00 2030 194.00 9E 3320 N906XJ JFK BUF 50.00 301 18 45
336763 2013 9 30 2235.00 2001 154.00 59.00 2249 130.00 B6 1083 N804JB JFK MCO 123.00 944 20 1

10034 rows × 18 columns

Note

외부 변수/함수를 참조하려면 @와 함께

delay_cutoff = 120
flights.query('arr_delay > @delay_cutoff')
def cut_off(df):
    return df["dep_delay"].min()

flights.query('arr_delay < @cut_off(@flights)')

위의 query 방식의 filtering은 다음과 같은 boolean indexing의 결과와 같음
flights[flights["arr_delay"] > 120]

# Flights that departed on January 1
flights.query('month == 1 and day == 1')  # == 과 = 을 혼동하지 말것!
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
0 2013 1 1 517.00 515 2.00 830.00 819 11.00 UA 1545 N14228 EWR IAH 227.00 1400 5 15
1 2013 1 1 533.00 529 4.00 850.00 830 20.00 UA 1714 N24211 LGA IAH 227.00 1416 5 29
2 2013 1 1 542.00 540 2.00 923.00 850 33.00 AA 1141 N619AA JFK MIA 160.00 1089 5 40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
839 2013 1 1 NaN 1935 NaN NaN 2240 NaN AA 791 N3EHAA LGA DFW NaN 1389 19 35
840 2013 1 1 NaN 1500 NaN NaN 1825 NaN AA 1925 N3EVAA LGA MIA NaN 1096 15 0
841 2013 1 1 NaN 600 NaN NaN 901 NaN B6 125 N618JB JFK FLL NaN 1069 6 0

842 rows × 18 columns

# Flights that departed in January or February
flights.query('month == 1 or month == 2')
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
0 2013 1 1 517.00 515 2.00 830.00 819 11.00 UA 1545 N14228 EWR IAH 227.00 1400 5 15
1 2013 1 1 533.00 529 4.00 850.00 830 20.00 UA 1714 N24211 LGA IAH 227.00 1416 5 29
2 2013 1 1 542.00 540 2.00 923.00 850 33.00 AA 1141 N619AA JFK MIA 160.00 1089 5 40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
136244 2013 2 28 NaN 1115 NaN NaN 1310 NaN MQ 4485 N725MQ LGA CMH NaN 479 11 15
136245 2013 2 28 NaN 830 NaN NaN 1205 NaN UA 1480 NaN EWR SFO NaN 2565 8 30
136246 2013 2 28 NaN 840 NaN NaN 1147 NaN UA 443 NaN JFK LAX NaN 2475 8 40

51955 rows × 18 columns

# A shorter way to select flights that departed in January or February
flights.query('month in [1, 2]')
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
0 2013 1 1 517.00 515 2.00 830.00 819 11.00 UA 1545 N14228 EWR IAH 227.00 1400 5 15
1 2013 1 1 533.00 529 4.00 850.00 830 20.00 UA 1714 N24211 LGA IAH 227.00 1416 5 29
2 2013 1 1 542.00 540 2.00 923.00 850 33.00 AA 1141 N619AA JFK MIA 160.00 1089 5 40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
136244 2013 2 28 NaN 1115 NaN NaN 1310 NaN MQ 4485 N725MQ LGA CMH NaN 479 11 15
136245 2013 2 28 NaN 830 NaN NaN 1205 NaN UA 1480 NaN EWR SFO NaN 2565 8 30
136246 2013 2 28 NaN 840 NaN NaN 1147 NaN UA 443 NaN JFK LAX NaN 2475 8 40

51955 rows × 18 columns

flights.query('arr_delay > 120 and not (origin == "JFK")')
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
119 2013 1 1 811.00 630 101.00 1047.00 830 137.00 MQ 4576 N531MQ LGA CLT 118.00 544 6 30
218 2013 1 1 957.00 733 144.00 1056.00 853 123.00 UA 856 N534UA EWR BOS 37.00 200 7 33
268 2013 1 1 1114.00 900 134.00 1447.00 1222 145.00 UA 1086 N76502 LGA IAH 248.00 1416 9 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
336529 2013 9 30 1738.00 1529 129.00 1906.00 1649 137.00 EV 4580 N12563 EWR MKE 110.00 725 15 29
336668 2013 9 30 1951.00 1649 182.00 2157.00 1903 174.00 EV 4294 N13988 EWR SAV 95.00 708 16 49
336724 2013 9 30 2053.00 1815 158.00 2310.00 2054 136.00 EV 5292 N600QX EWR ATL 91.00 746 18 15

6868 rows × 18 columns

flights.query('dep_time < sched_dep_time')
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
3 2013 1 1 544.00 545 -1.00 1004.00 1022 -18.00 B6 725 N804JB JFK BQN 183.00 1576 5 45
4 2013 1 1 554.00 600 -6.00 812.00 837 -25.00 DL 461 N668DN LGA ATL 116.00 762 6 0
5 2013 1 1 554.00 558 -4.00 740.00 728 12.00 UA 1696 N39463 EWR ORD 150.00 719 5 58
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
336766 2013 9 30 2240.00 2250 -10.00 2347.00 7 -20.00 B6 2002 N281JB JFK BUF 52.00 301 22 50
336767 2013 9 30 2241.00 2246 -5.00 2345.00 1 -16.00 B6 486 N346JB JFK ROC 47.00 264 22 46
336769 2013 9 30 2349.00 2359 -10.00 325.00 350 -25.00 B6 745 N516JB JFK PSE 196.00 1617 23 59

184782 rows × 18 columns

flights.query('arr_delay + dep_delay < 0')
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
3 2013 1 1 544.00 545 -1.00 1004.00 1022 -18.00 B6 725 N804JB JFK BQN 183.00 1576 5 45
4 2013 1 1 554.00 600 -6.00 812.00 837 -25.00 DL 461 N668DN LGA ATL 116.00 762 6 0
7 2013 1 1 557.00 600 -3.00 709.00 723 -14.00 EV 5708 N829AS LGA IAD 53.00 229 6 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
336766 2013 9 30 2240.00 2250 -10.00 2347.00 7 -20.00 B6 2002 N281JB JFK BUF 52.00 301 22 50
336767 2013 9 30 2241.00 2246 -5.00 2345.00 1 -16.00 B6 486 N346JB JFK ROC 47.00 264 22 46
336769 2013 9 30 2349.00 2359 -10.00 325.00 350 -25.00 B6 745 N516JB JFK PSE 196.00 1617 23 59

188401 rows × 18 columns

flights.query('not dep_delay.isna() and arr_delay.isna()')
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
471 2013 1 1 1525.00 1530 -5.00 1934.00 1805 NaN MQ 4525 N719MQ LGA XNA NaN 1147 15 30
477 2013 1 1 1528.00 1459 29.00 2002.00 1647 NaN EV 3806 N17108 EWR STL NaN 872 14 59
615 2013 1 1 1740.00 1745 -5.00 2158.00 2020 NaN MQ 4413 N739MQ LGA XNA NaN 1147 17 45
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334495 2013 9 28 1214.00 1225 -11.00 1801.00 1510 NaN AA 300 N488AA EWR DFW NaN 1372 12 25
335534 2013 9 29 1734.00 1711 23.00 2159.00 2020 NaN UA 327 N463UA EWR PDX NaN 2434 17 11
335805 2013 9 30 559.00 600 -1.00 NaN 715 NaN WN 464 N411WN EWR MDW NaN 711 6 0

1175 rows × 18 columns

delay_time = 900
flights.query('arr_delay > @delay_time')
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
7072 2013 1 9 641.00 900 1301.00 1242.00 1530 1272.00 HA 51 N384HA JFK HNL 640.00 4983 9 0
8239 2013 1 10 1121.00 1635 1126.00 1239.00 1810 1109.00 MQ 3695 N517MQ EWR ORD 111.00 719 16 35
151974 2013 3 17 2321.00 810 911.00 135.00 1020 915.00 DL 2119 N927DA LGA MSP 167.00 1020 8 10
173992 2013 4 10 1100.00 1900 960.00 1342.00 2211 931.00 DL 2391 N959DL JFK TPA 139.00 1005 19 0
235778 2013 6 15 1432.00 1935 1137.00 1607.00 2120 1127.00 MQ 3535 N504MQ JFK CMH 74.00 483 19 35
270376 2013 7 22 845.00 1600 1005.00 1044.00 1815 989.00 MQ 3075 N665MQ JFK CVG 96.00 589 16 0
327043 2013 9 20 1139.00 1845 1014.00 1457.00 2210 1007.00 AA 177 N338AA JFK SFO 354.00 2586 18 45
Note
  1. .query()의 결과는 view이므로 수정하려면 SettingWithCopyWarning; .copy() 후 사용 권장
  2. Python에서 유효하지 않은 변수명의 경우, 예를 들어, 빈칸이나 .이 있는 경우; income total, income.total
    Backtick으로 감싸줘야 함; `income total`, `income.total`, `class`
  3. Index값을 사용하여 query할 경우, 문서 참고
# df가 다음과 같을 때,
#      A     B
# 0 1.00  2.00
# 1  NaN  5.00
# 2 3.00  3.00
# 3 4.00   NaN

df.query('A > 1')
#      A     B
# 2 3.00  3.00
# 3 4.00   NaN

# NA를 포함하고자 할 때,
df.query('A > 1 | A.isna()') # .isna() : NA인지 여부
#      A     B
# 1  NaN  5.00
# 2 3.00  3.00
# 3 4.00   NaN

NA(missing)에 대해서는 뒤에서 자세히

sort_values()

# "year", "month", "day", "dep_time" 순서대로 내림차순으로 정렬
flights.sort_values(by=["year", "month", "day", "dep_time"], ascending=False)   # default: ascending=True
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
111279 2013 12 31 2356.00 2359 -3.00 436.00 445 -9.00 B6 745 N665JB JFK PSE 200.00 1617 23 59
111278 2013 12 31 2355.00 2359 -4.00 430.00 440 -10.00 B6 1503 N509JB JFK SJU 195.00 1598 23 59
111277 2013 12 31 2332.00 2245 47.00 58.00 3 55.00 B6 486 N334JB JFK ROC 60.00 264 22 45
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
839 2013 1 1 NaN 1935 NaN NaN 2240 NaN AA 791 N3EHAA LGA DFW NaN 1389 19 35
840 2013 1 1 NaN 1500 NaN NaN 1825 NaN AA 1925 N3EVAA LGA MIA NaN 1096 15 0
841 2013 1 1 NaN 600 NaN NaN 901 NaN B6 125 N618JB JFK FLL NaN 1069 6 0

336776 rows × 18 columns

# pd.DataFrame.nlargest()를 이용할 수도 있음
flights.nlargest(3, columns=["year", "month", "day", "dep_time"], keep="all")
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
111279 2013 12 31 2356.00 2359 -3.00 436.00 445 -9.00 B6 745 N665JB JFK PSE 200.00 1617 23 59
111278 2013 12 31 2355.00 2359 -4.00 430.00 440 -10.00 B6 1503 N509JB JFK SJU 195.00 1598 23 59
111277 2013 12 31 2332.00 2245 47.00 58.00 3 55.00 B6 486 N334JB JFK ROC 60.00 264 22 45
Tip

method의 설명이 잘 나타나지 않을 시, 함수를 직접 써서 확인
예를 들어,
pd.DataFrame.nlargest
pd.Series.nlargest

# "dep_time"은 내림차순으로, "arr_delay"는 오름차순으로
flights.sort_values(by=["dep_time", "arr_delay"], ascending=[False, True])
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
150301 2013 3 15 2400.00 2359 1.00 324.00 338 -14.00 B6 727 N636JB JFK BQN 186.00 1576 23 59
87893 2013 12 5 2400.00 2359 1.00 427.00 440 -13.00 B6 1503 N587JB JFK SJU 182.00 1598 23 59
212941 2013 5 21 2400.00 2359 1.00 339.00 350 -11.00 B6 739 N527JB JFK PSE 199.00 1617 23 59
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
336773 2013 9 30 NaN 1210 NaN NaN 1330 NaN MQ 3461 N535MQ LGA BNA NaN 764 12 10
336774 2013 9 30 NaN 1159 NaN NaN 1344 NaN MQ 3572 N511MQ LGA CLE NaN 419 11 59
336775 2013 9 30 NaN 840 NaN NaN 1020 NaN MQ 3531 N839MQ LGA RDU NaN 431 8 40

336776 rows × 18 columns


query()sort_values()을 함께 이용하여 좀 더 복잡한 문제를 해결할 수 있음
예를 들어, 다음과 같이 거의 제시간에 출발한(+- 10분 이내) 항공편들 중 가장 도착 지연이 큰 항공편을 찾을 수 있음

# Method chaining
(
    flights
    .query('dep_delay <= 10 & dep_delay >= -10')
    .sort_values("arr_delay", ascending=False)
)
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
55985 2013 11 1 658.00 700 -2.00 1329.00 1015 194.00 VX 399 N629VA JFK LAX 336.00 2475 7 0
181270 2013 4 18 558.00 600 -2.00 1149.00 850 179.00 AA 707 N3EXAA LGA DFW 234.00 1389 6 0
256340 2013 7 7 1659.00 1700 -1.00 2050.00 1823 147.00 US 2183 N948UW LGA DCA 64.00 214 17 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
334354 2013 9 28 847.00 839 8.00 1130.00 959 NaN EV 4510 N14542 EWR MKE NaN 725 8 39
334412 2013 9 28 1010.00 1020 -10.00 1344.00 1222 NaN EV 4412 N12175 EWR DSM NaN 1017 10 20
335805 2013 9 30 559.00 600 -1.00 NaN 715 NaN WN 464 N411WN EWR MDW NaN 711 6 0

239109 rows × 18 columns

sort_values() 파라미터
  • Row가 재정렬되는 operation을 한 후에는 index 순서가 바뀌는데, 이를 reset하려면,
    .sort_values(ignore_index=True)
  • DataFrame updated: inplace=True
  • NA는 sort 후 맨 뒤로
    맨 앞으로 오게하려면 na_position='first'

unique()

Series method

flights["origin"].unique()  # return as a NumPy array, but depends on the dtypes
array(['EWR', 'LGA', 'JFK'], dtype=object)
flights["origin"].nunique()  # return the number of unique values
3
# finds all unique origin and destination pairs.
flights[["origin", "dest"]].value_counts(sort=False)  # default: dropna=True
origin  dest
EWR     ALB      439
        ANC        8
        ATL     5022
                ... 
LGA     TVC       77
        TYS      308
        XNA      745
Name: count, Length: 224, dtype: int64
flights[["origin", "dest"]].value_counts(sort=False).reset_index(name="n")
    origin dest     n
0      EWR  ALB   439
1      EWR  ANC     8
2      EWR  ATL  5022
..     ...  ...   ...
221    LGA  TVC    77
222    LGA  TYS   308
223    LGA  XNA   745

[224 rows x 3 columns]

Columns

Select

기본적인 column selection은 이전 섹션 참고: subsetting

# Select columns by name
flights[["year", "month", "day"]]
        year  month  day
0       2013      1    1
1       2013      1    1
2       2013      1    1
...      ...    ...  ...
336773  2013      9   30
336774  2013      9   30
336775  2013      9   30

[336776 rows x 3 columns]
# Select all columns between year and day (inclusive)
flights.loc[:, "year":"day"]
        year  month  day
0       2013      1    1
1       2013      1    1
2       2013      1    1
...      ...    ...  ...
336773  2013      9   30
336774  2013      9   30
336775  2013      9   30

[336776 rows x 3 columns]
# Select all columns except those from year to day (inclusive)
# .isin(): includes
flights.loc[:, ~flights.columns.isin(["year", "month", "day"])]  # Boolean indexing
dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
0 517.00 515 2.00 830.00 819 11.00 UA 1545 N14228 EWR IAH 227.00 1400 5 15
1 533.00 529 4.00 850.00 830 20.00 UA 1714 N24211 LGA IAH 227.00 1416 5 29
2 542.00 540 2.00 923.00 850 33.00 AA 1141 N619AA JFK MIA 160.00 1089 5 40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
336773 NaN 1210 NaN NaN 1330 NaN MQ 3461 N535MQ LGA BNA NaN 764 12 10
336774 NaN 1159 NaN NaN 1344 NaN MQ 3572 N511MQ LGA CLE NaN 419 11 59
336775 NaN 840 NaN NaN 1020 NaN MQ 3531 N839MQ LGA RDU NaN 431 8 40

336776 rows × 15 columns

cols = [col for col in flights.columns if col not in ["year", "month", "day"]]
flights[cols]
        dep_time  sched_dep_time  dep_delay  arr_time  sched_arr_time  \
0         517.00             515       2.00    830.00             819   
1         533.00             529       4.00    850.00             830   
2         542.00             540       2.00    923.00             850   
...          ...             ...        ...       ...             ...   
336773       NaN            1210        NaN       NaN            1330   
336774       NaN            1159        NaN       NaN            1344   
336775       NaN             840        NaN       NaN            1020   

        arr_delay carrier  flight tailnum origin dest  air_time  distance  \
0           11.00      UA    1545  N14228    EWR  IAH    227.00      1400   
1           20.00      UA    1714  N24211    LGA  IAH    227.00      1416   
2           33.00      AA    1141  N619AA    JFK  MIA    160.00      1089   
...           ...     ...     ...     ...    ...  ...       ...       ...   
336773        NaN      MQ    3461  N535MQ    LGA  BNA       NaN       764   
336774        NaN      MQ    3572  N511MQ    LGA  CLE       NaN       419   
336775        NaN      MQ    3531  N839MQ    LGA  RDU       NaN       431   

        hour  minute  
0          5      15  
1          5      29  
2          5      40  
...      ...     ...  
336773    12      10  
336774    11      59  
336775     8      40  

[336776 rows x 15 columns]


Series/Index object의 method와 함께 특정 string을 기준으로 선택 (string method와는 구별)
.str.contains(), .str.startswith(), .str.endswith() ; True/False

# Select all columns that begin with “dep”.
flights.loc[:, flights.columns.str.startswith("dep")]  # Boolean indexing
        dep_time  dep_delay
0         517.00       2.00
1         533.00       4.00
2         542.00       2.00
...          ...        ...
336773       NaN        NaN
336774       NaN        NaN
336775       NaN        NaN

[336776 rows x 2 columns]
# Select all columns that are characters
flights.select_dtypes("object")  # dtype: object, number, ...
       carrier tailnum origin dest
0           UA  N14228    EWR  IAH
1           UA  N24211    LGA  IAH
2           AA  N619AA    JFK  MIA
...        ...     ...    ...  ...
336773      MQ  N535MQ    LGA  BNA
336774      MQ  N511MQ    LGA  CLE
336775      MQ  N839MQ    LGA  RDU

[336776 rows x 4 columns]

index selection: reindex

rename()

pd.DataFrame.rename
flights.rename(
    columns={"dep_time": "dep_t", "arr_time": "arr_t"},  
    # 첫번째 인자 index=
    # inplace=True: dataframe is updated
)
year month day dep_t sched_dep_time dep_delay arr_t sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
0 2013 1 1 517.00 515 2.00 830.00 819 11.00 UA 1545 N14228 EWR IAH 227.00 1400 5 15
1 2013 1 1 533.00 529 4.00 850.00 830 20.00 UA 1714 N24211 LGA IAH 227.00 1416 5 29
2 2013 1 1 542.00 540 2.00 923.00 850 33.00 AA 1141 N619AA JFK MIA 160.00 1089 5 40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
336773 2013 9 30 NaN 1210 NaN NaN 1330 NaN MQ 3461 N535MQ LGA BNA NaN 764 12 10
336774 2013 9 30 NaN 1159 NaN NaN 1344 NaN MQ 3572 N511MQ LGA CLE NaN 419 11 59
336775 2013 9 30 NaN 840 NaN NaN 1020 NaN MQ 3531 N839MQ LGA RDU NaN 431 8 40

336776 rows × 18 columns

Note

index를 rename할 수도 있음

flights.rename(
    index={0: "a", 1: "b"},
).head(3)
#    year  month  day  dep_time  ...
# a  2013      1    1    517.00  ...
# b  2013      1    1    533.00  ...
# 2  2013      1    1    542.00  ...
Tip

Series의 경우

s = flights.dep_delay.head(3)
# 0   2.00
# 1   4.00
# 2   2.00
# Name: dep_delay, dtype: float64

s.rename("what")
# 0   2.00
# 1   4.00
# 2   2.00
# Name: what, dtype: float64

함수 str.capitalize, str.lower, str.upper과 함께.

flights.rename(str.capitalize, axis="columns").head(3)  # axis=1
Year Month Day Dep_time Sched_dep_time Dep_delay Arr_time Sched_arr_time Arr_delay Carrier Flight Tailnum Origin Dest Air_time Distance Hour Minute
0 2013 1 1 517.00 515 2.00 830.00 819 11.00 UA 1545 N14228 EWR IAH 227.00 1400 5 15
1 2013 1 1 533.00 529 4.00 850.00 830 20.00 UA 1714 N24211 LGA IAH 227.00 1416 5 29
2 2013 1 1 542.00 540 2.00 923.00 850 33.00 AA 1141 N619AA JFK MIA 160.00 1089 5 40
Tip

일반적으로 임의의 함수가 적용됨.

flights.rename(lambda x: x[:3], axis="columns")

#     yea  mon  day    dep  sch  dep    arr  sch   arr car   fli     tai  ori  ...
# 0  2013    1    1 517.00  515 2.00 830.00  819 11.00  UA  1545  N14228  EWR   
# 1  2013    1    1 533.00  529 4.00 850.00  830 20.00  UA  1714  N24211  LGA   
# 2  2013    1    1 542.00  540 2.00 923.00  850 33.00  AA  1141  N619AA  JFK  
# ...

assign()

cols = ["year", "month", "day", "distance", "air_time"] + \
        [col for col in flights.columns if col.endswith("delay")]  # string method .endswith
flights_sml = flights[cols].copy()
flights_sml
        year  month  day  distance  air_time  dep_delay  arr_delay
0       2013      1    1      1400    227.00       2.00      11.00
1       2013      1    1      1416    227.00       4.00      20.00
2       2013      1    1      1089    160.00       2.00      33.00
...      ...    ...  ...       ...       ...        ...        ...
336773  2013      9   30       764       NaN        NaN        NaN
336774  2013      9   30       419       NaN        NaN        NaN
336775  2013      9   30       431       NaN        NaN        NaN

[336776 rows x 7 columns]
# 새로 만들어진 변수는 맨 뒤로
flights_sml.assign(
    gain=lambda x: x.dep_delay - x.arr_delay,   # x: DataFrame, flights_sml
    speed=flights_sml["distance"] / flights_sml["air_time"] * 60    # 직접 DataFrame 참조할 수도 있음
)
        year  month  day  distance  air_time  dep_delay  arr_delay   gain  \
0       2013      1    1      1400    227.00       2.00      11.00  -9.00   
1       2013      1    1      1416    227.00       4.00      20.00 -16.00   
2       2013      1    1      1089    160.00       2.00      33.00 -31.00   
...      ...    ...  ...       ...       ...        ...        ...    ...   
336773  2013      9   30       764       NaN        NaN        NaN    NaN   
336774  2013      9   30       419       NaN        NaN        NaN    NaN   
336775  2013      9   30       431       NaN        NaN        NaN    NaN   

        speed  
0      370.04  
1      374.27  
2      408.38  
...       ...  
336773    NaN  
336774    NaN  
336775    NaN  

[336776 rows x 9 columns]
# 앞에서 만든 변수나 함수를 이용할 수 있음
flights_sml.assign(
    gain=lambda x: x.dep_delay - x.arr_delay,
    hours=lambda x: x.air_time / 60,
    gain_per_hour=lambda x: x.gain / x.hours,
    rounded=lambda x: np.round(x.gain_per_hour, 1)  # use a numpy function
)
        year  month  day  distance  air_time  dep_delay  arr_delay   gain  \
0       2013      1    1      1400    227.00       2.00      11.00  -9.00   
1       2013      1    1      1416    227.00       4.00      20.00 -16.00   
2       2013      1    1      1089    160.00       2.00      33.00 -31.00   
...      ...    ...  ...       ...       ...        ...        ...    ...   
336773  2013      9   30       764       NaN        NaN        NaN    NaN   
336774  2013      9   30       419       NaN        NaN        NaN    NaN   
336775  2013      9   30       431       NaN        NaN        NaN    NaN   

        hours  gain_per_hour  rounded  
0        3.78          -2.38    -2.40  
1        3.78          -4.23    -4.20  
2        2.67         -11.62   -11.60  
...       ...            ...      ...  
336773    NaN            NaN      NaN  
336774    NaN            NaN      NaN  
336775    NaN            NaN      NaN  

[336776 rows x 11 columns]
# Find the fastest flights
(
    flights_sml
    .assign(speed=lambda x: x.distance / x.air_time)
    .sort_values(by="speed", ascending=False)
    .head(5)
)
        year  month  day  distance  air_time  dep_delay  arr_delay  speed
216447  2013      5   25       762     65.00       9.00     -14.00  11.72
251999  2013      7    2      1008     93.00      45.00      26.00  10.84
205388  2013      5   13       594     55.00      15.00      -1.00  10.80
157516  2013      3   23       748     70.00       4.00       2.00  10.69
10223   2013      1   12      1035    105.00      -1.00     -28.00   9.86

Groups

groupby()

  • groupby()는 데이터를 의미있는 그룹으로 나누어 분석할 수 있도록 해줌
  • .count(), .sum(), .mean(), .min(), .max()과 같은 통계치를 구하는 methods와 함께 효과적으로, 자주 활용됨


Source: Ch.10 in Python for Data Analysis (3e) by Wes McKinney

아래 표는 groupby()와 함께 자주 쓰이는 효율적인 methods

Source: Ch.10 in Python for Data Analysis (3e) by Wes McKinney

flights.groupby("month")  # “GroupBy” object
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x157d8de10>
flights_sml.groupby("month").mean()
         year   day  distance  air_time  dep_delay  arr_delay
month                                                        
1     2013.00 15.99   1006.84    154.19      10.04       6.13
2     2013.00 14.74   1000.98    151.35      10.82       5.61
3     2013.00 16.00   1011.99    149.08      13.23       5.81
...       ...   ...       ...       ...        ...        ...
10    2013.00 15.98   1038.88    148.89       6.24      -0.17
11    2013.00 15.29   1050.31    155.47       5.44       0.46
12    2013.00 15.72   1064.66    162.59      16.58      14.87

[12 rows x 6 columns]
# 보통은 다음과 같이 특정 columns을 선택
flights.groupby("month")["dep_delay"]  # [["dep_delay"]] 처럼 list로 입력하면 DataFrameGroupBy object
<pandas.core.groupby.generic.SeriesGroupBy object at 0x157d88890>
flights.groupby("month")["dep_delay"].mean()  # Series GroupBy object에 적용된 결과는 Series
month
1    10.04
2    10.82
3    13.23
      ... 
10    6.24
11    5.44
12   16.58
Name: dep_delay, Length: 12, dtype: float64
flights.groupby("month")[["dep_delay", "arr_delay"]].mean().head(3)
       dep_delay  arr_delay
month                      
1          10.04       6.13
2          10.82       5.61
3          13.23       5.81
flights.groupby(["month", "day"])["arr_delay"].nsmallest(1)
month  day        
1      1    696      -48.00
       2    919      -59.00
       3    2035     -65.00
                      ...  
12     29   108914   -60.00
       30   110330   -45.00
       31   111113   -44.00
Name: arr_delay, Length: 365, dtype: float64
Tip

Multi-index의 level을 drop하려면 droplevel()

flights.groupby(["month", "day"])["arr_delay"].nsmallest(1).droplevel(2)
# month  day
# 1      1     -48.00
#        2     -59.00
#        3     -65.00
#               ...  
# 12     29    -60.00
#        30    -45.00
#        31    -44.00
# Name: arr_delay, Length: 365, dtype: float64
flights.groupby(["origin", "dest"])["arr_delay"].count()
origin  dest
EWR     ALB      418
        ANC        8
        ATL     4876
                ... 
LGA     TVC       73
        TYS      265
        XNA      709
Name: arr_delay, Length: 224, dtype: int64
Tip

as_index=False: grouping 변수들을 index가 아닌 columns으로

flights.groupby(["month", "day"], as_index=False)["arr_delay"].mean().head(3)
#    month  day  arr_delay
# 0      1    1      12.65
# 1      1    2      12.69
# 2      1    3       5.73

물론, 결과물에 .reset_index() method를 사용해도 됨

Note

원칙적으로 grouping은 같은 DataFrame의 변수일 필요없이 match만 되면 됨
Source: Wes McKinney’s

# df
#    key1  key2  data1  data2
# 0     a     1   0.36  -0.42
# 1     a     2  -1.51   0.04
# 2  None     1   0.75  -0.28
# 3     b     2   0.57   0.25
# 4     b     1   1.30  -0.77
# 5     a  <NA>  -0.53  -0.73
# 6  None     1   2.04  -0.37

states = np.array(["OH", "CA", "CA", "OH", "OH", "CA", "OH"])
years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]

df["data1"].groupby([states, years]).mean()
# CA  2005   -1.02
#     2006    0.75
# OH  2005    0.46
#     2006    1.67
# Name: data1, dtype: float64
flights.groupby("dest").size()  # .size(): group의 사이즈/열의 갯수
dest
ABQ     254
ACK     265
ALB     439
       ... 
TVC     101
TYS     631
XNA    1036
Length: 105, dtype: int64
flights.groupby("tailnum", dropna=False).size()  # groupby는 기본적으로 NA 무시
tailnum
D942DN       4
N0EGMQ     371
N10156     153
          ... 
N999DN      61
N9EAMQ     248
NaN       2512
Length: 4044, dtype: int64
Note

.size().value_counts()와 유사하게 사용될 수 있음

flights["tailnum"].value_counts(dropna=False)
# NaN       2512
# N725MQ     575
# N722MQ     513
#           ... 
# N318AS       1
# N651UA       1
# N557AS       1
# Name: tailnum, Length: 4044, dtype: int64
Note

Index에 grouping하는 방식에 대해서는 Wes McKineey’s Chapter 10 참고

  • Grouping with Dictionaries and Series
  • Grouping with Functions
  • Grouping by Index Levels
Note

Time series 데이터의 경우 다양한 grouping 방식이 존재
Stephanie Molin의 ch.4 Working with time series data 참고

예를 들어, facebook stock에 대한 자료에서 2018년 4분기에 해당하는 날을 week단위로 그룹핑하여 volume을 다음과 같이 계산할 수 있음

fb.loc['2018-Q4'].groupby(pd.Grouper(freq='W')).volume.sum()
Tip

groupby filtering

flights.groupby(["year", "month", "day"]).filter(lambda x: x["arr_delay"].mean() < 0)

좀 더 복잡한 groupby filtering은 이후 apply를 이용


agg()

Aggregations: data transformation that produces scalar values from arrays

앞서 GroupBy object에 직접 stats function을 적용하였는데, agg()를 이용하여 더 확장, 일반화할 수 있음

# 모두 동일
flights_sml.groupby("month").mean()
flights_sml.groupby("month").agg("mean")  # function names
flights_sml.groupby("month").agg(np.mean)  # numpy functions
flights_sml.groupby("month").agg(lambda x: x.sum() / x.count())  # general functions; not optimized!
         year   day  distance  air_time  dep_delay  arr_delay
month                                                        
1     2013.00 15.99   1006.84    154.19      10.04       6.13
2     2013.00 14.74   1000.98    151.35      10.82       5.61
3     2013.00 16.00   1011.99    149.08      13.23       5.81
...       ...   ...       ...       ...        ...        ...
10    2013.00 15.98   1038.88    148.89       6.24      -0.17
11    2013.00 15.29   1050.31    155.47       5.44       0.46
12    2013.00 15.72   1064.66    162.59      16.58      14.87

[12 rows x 6 columns]
flights_sml.groupby("month")["arr_delay"].agg("mean")  # Series에 string으로 함수 적용
month
1     6.13
2     5.61
3     5.81
      ... 
10   -0.17
11    0.46
12   14.87
Name: arr_delay, Length: 12, dtype: float64
Tip

차이 참고

flights_sml.groupby("month")["arr_delay"].agg(["mean"])  # list로 함수를 입력하는 경우
flights_sml.groupby("month")[["arr_delay"]].agg("mean")  # DataFrame에 적용될 때
#        mean                    arr_delay
# month                 # month
# 1      6.13           # 1           6.13
# 2      5.61           # 2           5.61
# 3      5.81           # 3           5.81
# ...                   # ...
flights.groupby("month")["dep_delay"].agg(["mean", "count"])
       mean  count
month             
1     10.04  26483
2     10.82  23690
3     13.23  27973
...     ...    ...
10     6.24  28653
11     5.44  27035
12    16.58  27110

[12 rows x 2 columns]
flights_sml.groupby("month")[["arr_delay"]].agg(["mean", "median"])
      arr_delay       
           mean median
month                 
1          6.13  -3.00
2          5.61  -3.00
3          5.81  -6.00
...         ...    ...
10        -0.17  -7.00
11         0.46  -6.00
12        14.87   2.00

[12 rows x 2 columns]
# DataFrame Groupby object의 경우 column 별로 적용
flights_sml.groupby("month")[["arr_delay", "dep_delay"]].agg("mean")
       arr_delay  dep_delay
month                      
1           6.13      10.04
2           5.61      10.82
3           5.81      13.23
...          ...        ...
10         -0.17       6.24
11          0.46       5.44
12         14.87      16.58

[12 rows x 2 columns]
flights_sml.groupby("month")[["arr_delay", "dep_delay"]].agg(["mean", "count"])
      arr_delay        dep_delay       
           mean  count      mean  count
month                                  
1          6.13  26398     10.04  26483
2          5.61  23611     10.82  23690
3          5.81  27902     13.23  27973
...         ...    ...       ...    ...
10        -0.17  28618      6.24  28653
11         0.46  26971      5.44  27035
12        14.87  27020     16.58  27110

[12 rows x 4 columns]

컬럼 별로 다른 함수를 적용하고자 할 때 (컬럼 이름을 지정): tuple

flights_agg = (
    flights.groupby("month")
    .agg(
        air_min=("air_time", "min"),  # 컬럼 이름 지정 = (컬럼, 함수)
        air_max=("air_time", "max"),
        dep_mean=("dep_delay", "mean"),
        arr_median=("arr_delay", "median"),
    )
)

flights_agg
       air_min  air_max  dep_mean  arr_median
month                                        
1        20.00   667.00     10.04       -3.00
2        21.00   691.00     10.82       -3.00
3        21.00   695.00     13.23       -6.00
...        ...      ...       ...         ...
10       23.00   642.00      6.24       -7.00
11       24.00   676.00      5.44       -6.00
12       21.00   661.00     16.58        2.00

[12 rows x 4 columns]
Note

컬럼별 다른 함수를 적용하는 다른 방식: dictionary

flights_agg = flights.groupby("month").agg({
    "air_time": ["min", "max"],
    "dep_delay": "mean",
    "arr_delay": "median"
})
#       air_time        dep_delay arr_delay
#            min    max      mean    median
# month                                    
# 1        20.00 667.00     10.04     -3.00
# 2        21.00 691.00     10.82     -3.00
# 3        21.00 695.00     13.23     -6.00
# ...

MultiIndex를 collapse하는 팁

flights_agg.columns
# MultiIndex([( 'air_time',    'min'),
#             ( 'air_time',    'max'),
#             ('dep_delay',   'mean'),
#             ('arr_delay', 'median')],
#            )

flights_agg.columns = ['_'.join(col_agg) for col_agg in flights_agg.columns]
flights_agg.head(3)
#        air_time_min  air_time_max  dep_delay_mean  arr_delay_median
# month                                                              
# 1             20.00        667.00           10.04             -3.00
# 2             21.00        691.00           10.82             -3.00
# 3             21.00        695.00           13.23             -6.00
Note

agg()에는 custom function을 pass할 수 있음
단, the optimized functions (Table 10-1)에 비해 일반적으로 훨씬 느림

def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped_dist = flights_sml.groupby(["month", "day"])["distance"]

grouped_dist.agg(["std", peak_to_peak])  # a list of functions
             std  peak_to_peak
month day                     
1     1   727.73          4889
      2   721.72          4889
      3   714.95          4903
...          ...           ...
12    29  728.78          4887
      30  723.88          4887
      31  731.36          4887

[365 rows x 2 columns]
# Naming a function as a tuple
grouped_dist.agg([("sd", "std"), ("range", peak_to_peak)])
              sd  range
month day              
1     1   727.73   4889
      2   721.72   4889
      3   714.95   4903
...          ...    ...
12    29  728.78   4887
      30  723.88   4887
      31  731.36   4887

[365 rows x 2 columns]
Note

.describe()는 aggregation은 아니나 grouped objects에 적용가능

flights_sml.groupby("month")[["dep_delay", "arr_delay"]].describe()

#       dep_delay                                              arr_delay        \
#           count  mean   std    min   25%   50%   75%     max     count  mean   
# month                                                                          
# 1      26483.00 10.04 36.39 -30.00 -5.00 -2.00  8.00 1301.00  26398.00  6.13   
# 2      23690.00 10.82 36.27 -33.00 -5.00 -2.00  9.00  853.00  23611.00  5.61   
# 3      27973.00 13.23 40.13 -25.00 -5.00 -1.00 12.00  911.00  27902.00  5.81   
# ...         ...   ...   ...    ...   ...   ...   ...     ...       ...   ...    
                                               
#         std    min    25%   50%   75%     max  
# month                                          
# 1     40.42 -70.00 -15.00 -3.00 13.00 1272.00  
# 2     39.53 -70.00 -15.00 -3.00 13.00  834.00  
# 3     44.12 -68.00 -18.00 -6.00 13.00  915.00  
# ...     ...    ...    ...   ...   ...     ...  

# [12 rows x 16 columns]

transform()

앞서 group별로 통계치가 summary되어 원래 reduced 데이터프레임으로 변형됐다면,
transform()은 group별로 얻은 통계치가 원래 데이터의 형태를 그대로 보존하면서 출력
만약, 전달되는 함수가 Series를 반환하려면, 동일한 사이즈로 반환되어야 함.

flights_sml.groupby(["month"])["arr_delay"].mean()
month
1     6.13
2     5.61
3     5.81
      ... 
10   -0.17
11    0.46
12   14.87
Name: arr_delay, Length: 12, dtype: float64
grouped_delay = flights_sml.groupby(["month"])["arr_delay"].transform("mean")
grouped_delay
0         6.13
1         6.13
2         6.13
          ... 
336773   -4.02
336774   -4.02
336775   -4.02
Name: arr_delay, Length: 336776, dtype: float64
flights_sml["monthly_delay"] = grouped_delay
flights_sml
        year  month  day  distance  air_time  dep_delay  arr_delay  \
0       2013      1    1      1400    227.00       2.00      11.00   
1       2013      1    1      1416    227.00       4.00      20.00   
2       2013      1    1      1089    160.00       2.00      33.00   
...      ...    ...  ...       ...       ...        ...        ...   
336773  2013      9   30       764       NaN        NaN        NaN   
336774  2013      9   30       419       NaN        NaN        NaN   
336775  2013      9   30       431       NaN        NaN        NaN   

        monthly_delay  
0                6.13  
1                6.13  
2                6.13  
...               ...  
336773          -4.02  
336774          -4.02  
336775          -4.02  

[336776 rows x 8 columns]

Q: 1년에 10000편 이상 운항편이 있는 도착지로 가는 항공편들만 추리면,

dest_size =  flights.groupby("dest").transform("size")
dest_size

# 또는 flights.groupby("dest")["dest"].transform("count")
0          7198
1          7198
2         11728
          ...  
336773     6333
336774     4573
336775     8163
Length: 336776, dtype: int64
# 1년에 10000편 이상 운항편이 있는 도착지에 대한 항공편
flights[dest_size >= 10000]
        year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
2       2013      1    1    542.00             540       2.00    923.00   
4       2013      1    1    554.00             600      -6.00    812.00   
5       2013      1    1    554.00             558      -4.00    740.00   
...      ...    ...  ...       ...             ...        ...       ...   
336762  2013      9   30   2233.00            2113      80.00    112.00   
336763  2013      9   30   2235.00            2001     154.00     59.00   
336768  2013      9   30   2307.00            2255      12.00   2359.00   

        sched_arr_time  arr_delay carrier  flight tailnum origin dest  \
2                  850      33.00      AA    1141  N619AA    JFK  MIA   
4                  837     -25.00      DL     461  N668DN    LGA  ATL   
5                  728      12.00      UA    1696  N39463    EWR  ORD   
...                ...        ...     ...     ...     ...    ...  ...   
336762              30      42.00      UA     471  N578UA    EWR  SFO   
336763            2249     130.00      B6    1083  N804JB    JFK  MCO   
336768            2358       1.00      B6     718  N565JB    JFK  BOS   

        air_time  distance  hour  minute  
2         160.00      1089     5      40  
4         116.00       762     6       0  
5         150.00       719     5      58  
...          ...       ...   ...     ...  
336762    318.00      2565    21      13  
336763    123.00       944    20       1  
336768     33.00       187    22      55  

[131440 rows x 18 columns]

Q: 하루 중 출발 지연이 가장 늦은 두 항공편들을 매일 각각 구하면,

def get_ranks(group):
    return group.rank(ascending=False, method="min") # method: 동일 등수에 대한 처리방식

delay_rank = flights.groupby(["month", "day"])["dep_delay"].transform(get_ranks)
# 또는 .transform("rank", ascending=False, method="min")  # using "rank" 함수

delay_rank
0        302.00
1        269.00
2        302.00
          ...  
336773      NaN
336774      NaN
336775      NaN
Name: dep_delay, Length: 336776, dtype: float64
flights[delay_rank < 3].head(6)
      year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
151   2013      1    1    848.00            1835     853.00   1001.00   
834   2013      1    1   2343.00            1724     379.00    314.00   
1440  2013      1    2   1607.00            1030     337.00   2003.00   
1749  2013      1    2   2131.00            1512     379.00   2340.00   
2598  2013      1    3   2008.00            1540     268.00   2339.00   
2637  2013      1    3   2056.00            1605     291.00   2239.00   

      sched_arr_time  arr_delay carrier  flight tailnum origin dest  air_time  \
151             1950     851.00      MQ    3944  N942MQ    JFK  BWI     41.00   
834             1938     456.00      EV    4321  N21197    EWR  MCI    222.00   
1440            1355     368.00      AA     179  N324AA    JFK  SFO    346.00   
1749            1741     359.00      UA     488  N593UA    LGA  DEN    228.00   
2598            1909     270.00      DL    2027  N338NW    JFK  FLL    158.00   
2637            1754     285.00      9E    3459  N928XJ    JFK  BNA    125.00   

      distance  hour  minute  
151        184    18      35  
834       1092    17      24  
1440      2586    10      30  
1749      1620    15      12  
2598      1069    15      40  
2637       765    16       5  

Q: Standarize air time by destination

dest_air = flights.groupby("dest")["air_time"]
# Z = (x - mean) / std
(flights['air_time'] - dest_air.transform('mean')) / dest_air.transform('std')
0        1.73
1        1.73
2        0.61
         ... 
336773    NaN
336774    NaN
336775    NaN
Name: air_time, Length: 336776, dtype: float64
def standardize(x):
    return (x - x.mean()) / x.std()
dest_air.transform(standardize)
0        1.73
1        1.73
2        0.61
         ... 
336773    NaN
336774    NaN
336775    NaN
Name: air_time, Length: 336776, dtype: float64

apply()

Apply: General split-apply-combine in McKineey’s Chapter 10.3 참고

The most general-purpose GroupBy method is apply, which is the subject of this section. apply splits the object being manipulated into pieces, invokes the passed function on each piece, and then attempts to concatenate the pieces.

tips = sns.load_dataset("tips")
tips = tips.assign(tip_pct = lambda x: x.tip / x.total_bill)
tips.head(3)
   total_bill  tip     sex smoker  day    time  size  tip_pct
0       16.99 1.01  Female     No  Sun  Dinner     2     0.06
1       10.34 1.66    Male     No  Sun  Dinner     3     0.16
2       21.01 3.50    Male     No  Sun  Dinner     3     0.17
def top(df, n=5, column="tip_pct"):
    return df.sort_values(column, ascending=False)[:n]  # DataFrame을 반환
top(tips, n=4)
     total_bill  tip     sex smoker  day    time  size  tip_pct
172        7.25 5.15    Male    Yes  Sun  Dinner     2     0.71
178        9.60 4.00  Female    Yes  Sun  Dinner     2     0.42
67         3.07 1.00  Female    Yes  Sat  Dinner     1     0.33
232       11.61 3.39    Male     No  Sat  Dinner     2     0.29
# DataFrame Groupby ojbect에 적용
tips.groupby("time", observed=False).apply(top)
            total_bill  tip     sex smoker   day    time  size  tip_pct
time                                                                   
Lunch  149        7.51 2.00    Male     No  Thur   Lunch     2     0.27
       221       13.42 3.48  Female    Yes   Fri   Lunch     2     0.26
       194       16.58 4.00    Male    Yes  Thur   Lunch     2     0.24
...                ...  ...     ...    ...   ...     ...   ...      ...
Dinner 67         3.07 1.00  Female    Yes   Sat  Dinner     1     0.33
       232       11.61 3.39    Male     No   Sat  Dinner     2     0.29
       183       23.17 6.50    Male    Yes   Sun  Dinner     4     0.28

[10 rows x 8 columns]
tips.groupby(["time", "day"], observed=False).apply(top, n=1, column="total_bill")
                 total_bill   tip     sex smoker   day    time  size  tip_pct
time   day                                                                   
Lunch  Thur 197       43.11  5.00  Female    Yes  Thur   Lunch     4     0.12
       Fri  225       16.27  2.50  Female    Yes   Fri   Lunch     2     0.15
Dinner Thur 243       18.78  3.00  Female     No  Thur  Dinner     2     0.16
       Fri  95        40.17  4.73    Male    Yes   Fri  Dinner     4     0.12
       Sat  170       50.81 10.00    Male    Yes   Sat  Dinner     3     0.20
       Sun  156       48.17  5.00    Male     No   Sun  Dinner     6     0.10

GroupBy안에서 describe()와 같은 method를 적용하면, 사실 다음과 같은 shortcut임

def f(group):
    return group.describe()
tips.groupby("sex", observed=False)["tip_pct"].describe().T  # T: transpose
sex     Male  Female
count 157.00   87.00
mean    0.16    0.17
std     0.06    0.05
...      ...     ...
50%     0.15    0.16
75%     0.19    0.19
max     0.71    0.42

[8 rows x 2 columns]
def mean2(group):
    return group.mean()  # Series를 반환!
tips.groupby("time", observed=False)[["total_bill", "tip"]].apply(mean2)
        total_bill  tip
time                   
Lunch        17.17 2.73
Dinner       20.80 3.10
Important

적용되는 함수는 pandas object나 scalar value를 반환해야 함

Note

위에서 agg()에 custom function을 pass할 수 있지만, 일반적으로 훨씬 느리다고 했는데,
이는 numeric array에 apply() method를 사용할 때에도 해당됨.
가능하다면, apply()를 피하는 것이 실행 속도를 크게 높일 수 있음. (단, “string”에 적용될 때는 차이 없음)

Suppressing the Group Keys
tips.groupby("time", group_keys=False).apply(top)
#      total_bill  tip     sex smoker   day    time  size  tip_pct
# 149        7.51 2.00    Male     No  Thur   Lunch     2     0.27
# 221       13.42 3.48  Female    Yes   Fri   Lunch     2     0.26
# 194       16.58 4.00    Male    Yes  Thur   Lunch     2     0.24
# ...
Hide warnings
import warnings
warnings.filterwarnings("ignore")

pandas method 비교

  • applymap: element-wise, DataFrame method
  • map: element-wise, Series method
  • apply: column/row-wise, DataFrame method, 또는 element-wise, Series method

Source, 비교 참고

def my_format(x):
    return f"{x:.1f}"
tips_num = tips.select_dtypes("number")  # select numeric columns

# element-wise, DataFrame method
tips_num.applymap(my_format)
    total_bill  tip size tip_pct
0         17.0  1.0  2.0     0.1
1         10.3  1.7  3.0     0.2
2         21.0  3.5  3.0     0.2
..         ...  ...  ...     ...
241       22.7  2.0  2.0     0.1
242       17.8  1.8  2.0     0.1
243       18.8  3.0  2.0     0.2

[244 rows x 4 columns]
# element-wise, Series method
tips["tip"].map(my_format)
0      1.0
1      1.7
2      3.5
      ... 
241    2.0
242    1.8
243    3.0
Name: tip, Length: 244, dtype: object
def peak_to_peak(arr):
    return arr.max() - arr.min()
# column-wise operation
tips_num.apply(peak_to_peak)

# row-wise operation
tips_num.apply(peak_to_peak, axis="columns")
total_bill   47.74
tip           9.00
size          5.00
tip_pct       0.67
dtype: float64
0     16.93
1     10.18
2     20.84
       ... 
241   22.58
242   17.72
243   18.62
Length: 244, dtype: float64

Groupby object에 적용할 때 비교

apply에 패스되는 함수가 Series나 DataFrame를 반환하는 경우

def f2(x):
    return pd.Series([x.min(), x.max()], index=["min", "max"])  # Series를 반환
# apply에 패스되는 함수가 Series를 반환하는 경우
tips_num.apply(f2)
     total_bill   tip  size  tip_pct
min        3.07  1.00     1     0.04
max       50.81 10.00     6     0.71
# Series Groupby object의 경우
tips.groupby("time", observed=False)["tip"].apply(f2)
time       
Lunch   min    1.25
        max    6.70
Dinner  min    1.00
        max   10.00
Name: tip, dtype: float64
# DataFrame GroupBy object의 경우
def f3(g):
    x = g["tip"]
    return pd.Series([x.min(), x.max()], index=["min", "max"])

tips.groupby("time", observed=False).apply(f3)
        min   max
time             
Lunch  1.25  6.70
Dinner 1.00 10.00

Groupby filtering

앞서 다음과 같이 .filter()를 이용하여 groupby filtering을 했는데, apply를 이용하여 좀 더 복잡한 filtering을 할 수 있음

flights.groupby(["year", "month", "day"]).filter(lambda x: x["arr_delay"].mean() < 0)

다음과 같은 방식으로 apply를 이용하여 filtering하면 위와 동일함.

def less_than(group, t):
    condition = group["arr_delay"].mean() < t
    return group if condition else None
    
flights.groupby(["year", "month", "day"]).apply(less_than, 0)

가령, 도착지연이 그날 하루 평균보다 낮았던 항공편들만 추리기

def less_than_mean(group):
    idx = group["arr_delay"] < group["arr_delay"].mean()
    return group[idx]

flights.groupby(["year", "month", "day"]).apply(less_than_mean)
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
year month day
2013 1 1 0 2013 1 1 517.00 515 2.00 830.00 819 11.00 UA 1545 N14228 EWR IAH 227.00 1400 5 15
3 2013 1 1 544.00 545 -1.00 1004.00 1022 -18.00 B6 725 N804JB JFK BQN 183.00 1576 5 45
4 2013 1 1 554.00 600 -6.00 812.00 837 -25.00 DL 461 N668DN LGA ATL 116.00 762 6 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12 31 111276 2013 12 31 2328.00 2330 -2.00 412.00 409 3.00 B6 1389 N651JB EWR SJU 198.00 1608 23 30
111278 2013 12 31 2355.00 2359 -4.00 430.00 440 -10.00 B6 1503 N509JB JFK SJU 195.00 1598 23 59
111279 2013 12 31 2356.00 2359 -3.00 436.00 445 -9.00 B6 745 N665JB JFK PSE 200.00 1617 23 59

207909 rows × 18 columns

grouping 변수를 제거하고 깔끔하게 하려면; group_keys=False

flights.groupby(["year", "month", "day"], group_keys=False).apply(less_than_mean)
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
0 2013 1 1 517.00 515 2.00 830.00 819 11.00 UA 1545 N14228 EWR IAH 227.00 1400 5 15
3 2013 1 1 544.00 545 -1.00 1004.00 1022 -18.00 B6 725 N804JB JFK BQN 183.00 1576 5 45
4 2013 1 1 554.00 600 -6.00 812.00 837 -25.00 DL 461 N668DN LGA ATL 116.00 762 6 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
111276 2013 12 31 2328.00 2330 -2.00 412.00 409 3.00 B6 1389 N651JB EWR SJU 198.00 1608 23 30
111278 2013 12 31 2355.00 2359 -4.00 430.00 440 -10.00 B6 1503 N509JB JFK SJU 195.00 1598 23 59
111279 2013 12 31 2356.00 2359 -3.00 436.00 445 -9.00 B6 745 N665JB JFK PSE 200.00 1617 23 59

207909 rows × 18 columns

Custom methods

다음과 같이 pandas object에 custom method를 추가할 수 있음

import pandas as pd

def select_cols(self, *cols):
    """
    Select a subset of columns from the DataFrame.

    Parameters:
    - cols (tuple of str): Column names to select. Use a colon (:) to specify a range of columns.

    Returns:
    - pd.DataFrame: The DataFrame with selected columns.

    Example:
    - tips.select("tip", "day:size")
    """
    selected_cols = []
    for col in cols:
        if ":" in col:
            start, end = col.split(":")
            columns = self.loc[:, start:end].columns
            selected_cols.extend(columns)
        else:
            selected_cols.append(col)

    return self[selected_cols]

# Add the custom method to the DataFrame class
pd.DataFrame.select = select_cols


flights.select("origin", "dest", "dep_delay:arr_delay").head()
#   origin dest  dep_delay  arr_time  sched_arr_time  arr_delay
# 0    EWR  IAH       2.00    830.00             819      11.00
# 1    LGA  IAH       4.00    850.00             830      20.00
# 2    JFK  MIA       2.00    923.00             850      33.00
# 3    JFK  BQN      -1.00   1004.00            1022     -18.00
# 4    LGA  ATL      -6.00    812.00             837     -25.00

# Build useful methods
flights.deselect()
flights.relocate()