# Notebook setup: imports plus display options for pandas, NumPy and matplotlib.

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format  # show floats with 2 decimals
# To undo the float format: pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision=2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")
Data: On-time data for all flights that departed NYC (i.e. JFK, LGA or EWR) in 2013
# Import the dataset: fetch "flights" from the "nycflights13" package
# through statsmodels' Rdatasets helper.
flights_data = sm.datasets.get_rdataset("flights", "nycflights13")
flights = flights_data.data
flights = flights.drop(columns="time_hour")  # drop the "time_hour" column

# Description: print the documentation text that ships with the dataset.
print(flights_data.__doc__)

# Display the DataFrame (last expression of the cell).
flights
year month day dep_time sched_dep_time dep_delay arr_time \
0 2013 1 1 517.00 515 2.00 830.00
1 2013 1 1 533.00 529 4.00 850.00
2 2013 1 1 542.00 540 2.00 923.00
... ... ... ... ... ... ... ...
336773 2013 9 30 NaN 1210 NaN NaN
336774 2013 9 30 NaN 1159 NaN NaN
336775 2013 9 30 NaN 840 NaN NaN
sched_arr_time arr_delay carrier flight tailnum origin dest \
0 819 11.00 UA 1545 N14228 EWR IAH
1 830 20.00 UA 1714 N24211 LGA IAH
2 850 33.00 AA 1141 N619AA JFK MIA
... ... ... ... ... ... ... ...
336773 1330 NaN MQ 3461 N535MQ LGA BNA
336774 1344 NaN MQ 3572 N511MQ LGA CLE
336775 1020 NaN MQ 3531 N839MQ LGA RDU
air_time distance hour minute
0 227.00 1400 5 15
1 227.00 1416 5 29
2 160.00 1089 5 40
... ... ... ... ...
336773 NaN 764 12 10
336774 NaN 419 11 59
336775 NaN 431 8 40
[336776 rows x 18 columns]
# Filter rows with the boolean Series `idx`, then select the "dest" column.
flights_6[idx]["dest"]
8 MCO
82 SFO
63 LAX
Name: dest, dtype: object
Note
사실 boolean indexing을 할 때, boolean Series의 index는 대상 DataFrame/Series의 index와 매칭(align)됨
대부분의 경우 염려하지 않아도 되지만, 다음과 같은 결과를 참고할 것
# Reset index: the boolean Series now carries a fresh 0..n-1 index.
idx_reset = idx.reset_index(drop=True)
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# 5    False
# Name: dep_delay, dtype: bool

# Indexing with the re-indexed boolean Series fails, because its index no
# longer matches the index of the DataFrame being indexed.
flights_6[idx_reset]["dest"]
#> IndexingError: Unalignable boolean Series provided as indexer
#> (index of the boolean Series and of the indexed object do not match)

# Boolean indexing with a NumPy array, which has no index, works without problems.
flights_6[idx_reset.to_numpy()]["dest"]
# 8     MCO
# 82    SFO
# 63    LAX
# Name: dest, dtype: object
# Boolean indexing along the columns axis: keep all rows, select columns via `cols`.
flights.loc[:, cols]
dep_time sched_dep_time arr_time sched_arr_time air_time
0 517.00 515 830.00 819 227.00
1 533.00 529 850.00 830 227.00
2 542.00 540 923.00 850 160.00
... ... ... ... ... ...
336773 NaN 1210 NaN 1330 NaN
336774 NaN 1159 NaN 1344 NaN
336775 NaN 840 NaN 1020 NaN
[336776 rows x 5 columns]
Warning
Chained indexing으로 값을 assign하는 경우, copy vs. view 경고 메시지가 발생함
# Deliberate anti-pattern, kept to reproduce the warning shown below:
# the assignment goes through chained indexing (row filter, then column),
# so pandas reports that the value is being set on a copy of a slice.
# The warning recommends a single .loc[row_indexer, col_indexer] = value
# assignment instead, e.g.
#   flights.loc[flights["arr_delay"] < 0, "arr_delay"] = 0
flights[flights["arr_delay"] <0]["arr_delay"] =0
/var/folders/mp/vcywncl97ml2q4c_5k2r573m0000gn/T/ipykernel_96692/3780864177.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy