Load packages
# numerical calculation & data frames
import numpy as np
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
# statistics
import statsmodels.api as sm
import statsmodels.formula.api as smf
# pandas options
pd.set_option('mode.copy_on_write' , True ) # pandas 2.0
pd.options.display.float_format = ' {:.2f} ' .format # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7 # max number of rows to display
# NumPy options
np.set_printoptions(precision = 2 , suppress= True ) # suppress scientific notation
# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina" )
Bike Sharing Demand
Bike Sharing in Washington D.C. Dataset
bikeshare = pd.read_csv("../data/hour.csv" )
bikeshare_daily = pd.read_csv("../data/day.csv" )
def clean_data(df):
df.rename({"dteday" : "date" , "cnt" : "count" }, axis= 1 , inplace= True )
df = df.assign(
date= lambda x: pd.to_datetime(x["date" ]), # datetime type으로 변환
year= lambda x: x["date" ].dt.year.astype(str ), # year 추출
day= lambda x: x["date" ].dt.day_of_year, # day of the year 추출
month= lambda x: x["date" ].dt.month_name().str [:3 ], # month 추출
wday= lambda x: x["date" ].dt.day_name().str [:3 ], # 요일 추출
)
df["season" ] = (
df["season" ]
.map ({1 : "winter" , 2 : "spring" , 3 : "summer" , 4 : "fall" }) # season을 문자열로 변환
.astype("category" ) # category type으로 변환
.cat.set_categories(
["winter" , "spring" , "summer" , "fall" ], ordered= True
) # 순서를 지정
)
return df
bikes = clean_data(bikeshare)
bikes_daily = clean_data(bikeshare_daily)
instant date season yr mnth holiday weekday workingday \
0 1 2011-01-01 winter 0 1 0 6 0
1 2 2011-01-02 winter 0 1 0 0 0
2 3 2011-01-03 winter 0 1 0 1 1
weathersit temp atemp hum windspeed casual registered count year \
0 2 0.34 0.36 0.81 0.16 331 654 985 2011
1 2 0.36 0.35 0.70 0.25 131 670 801 2011
2 1 0.20 0.19 0.44 0.25 120 1229 1349 2011
day month wday
0 1 Jan Sat
1 2 Jan Sun
2 3 Jan Mon
(
so.Plot(bikes_daily, x= 'date' )
.pair(y= ['casual' , 'registered' ])
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
.layout(size= (8 , 6 ))
)
(
so.Plot(bikes_daily, x= 'date' )
.pair(y= ['casual' , 'registered' ])
.add(so.Dots())
.add(so.Line(), so.Spline(10 ))
.layout(size= (8 , 6 ))
)
(
so.Plot(bikes_daily, x= 'day' , color= "year" )
.pair(y= ['casual' , 'registered' ])
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
.layout(size= (7 , 7 ))
)
(
so.Plot(bikes_daily, x= 'day' , color= "year" )
.pair(y= ['casual' , 'registered' ])
.add(so.Dots())
.add(so.Line(), so.Spline(5 ))
.layout(size= (7 , 7 ))
)
(
so.Plot(bikes_daily, x= 'day' , y= 'temp' , color= "year" )
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
)
# 요일에 따라 차이가 있는가?
from sbcustom import boxplot
(
boxplot(bikes_daily, x= "wday" , y= "casual" )
.facet("year" )
.layout(size= (8 , 4 ))
)
(
boxplot(bikes_daily, x= "wday" , y= "registered" )
.facet("year" )
.layout(size= (8 , 4 ))
)
# 달에 따라 차이가 있는가?
(
boxplot(bikes_daily, x= "month" , y= "casual" )
.facet("year" )
.layout(size= (8 , 4 ))
)
(
boxplot(bikes_daily, x= "month" , y= "registered" )
.facet("year" )
.layout(size= (8 , 4 ))
)
# 기온에 따라 차이가 있는가?
(
so.Plot(bikes_daily, x= 'temp' )
.pair(y= ['casual' , 'registered' ])
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
.facet("year" )
.layout(size= (9 , 7 ))
)
Y의 분포가 daily, hourly에 따라 매우 다름
# daily
(
so.Plot(bikes_daily)
.pair(x= ['casual' , 'registered' ])
.add(so.Bars(), so.Hist())
.facet(row= "year" )
.layout(size= (6 , 5 ))
)
# hourly
(
so.Plot(bikes)
.pair(x= ['casual' , 'registered' ])
.add(so.Bars(), so.Hist())
.facet(row= "year" )
.layout(size= (6 , 5 ))
)
(
so.Plot(bikes_daily, x= 'temp' )
.pair(y= ['casual' , 'registered' ])
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
.facet("year" )
.layout(size= (9 , 7 ))
)
# 바람의 세기에 따라 차이가 있는가?
(
so.Plot(bikes_daily, x= 'windspeed' )
.pair(y= ['casual' , 'registered' ])
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
.facet("year" )
.layout(size= (9 , 7 ))
)
# 습도에 따라 차이가 있는가?
(
so.Plot(bikes_daily, x= 'hum' )
.pair(y= ['casual' , 'registered' ])
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
.facet("year" )
.layout(size= (9 , 7 ))
.limit(x= (0.4 , 1 ))
)
# 기온과 습도의 관계?
(
so.Plot(bikes_daily, x= 'temp' , y= 'hum' )
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
)
Registered bikers
(
so.Plot(bikes_daily, x= 'temp' , y= 'registered' )
.add(so.Dots(color= ".6" ))
.add(so.Line(), so.PolyFit(5 ))
.add(so.Line(color= "red" ), so.PolyFit(2 ))
.add(so.Line(color= "black" ), so.Spline(5 ))
.facet("year" )
)
year
는 예측변수로 의미는 없으나 modeling 방식의 예시를 위해 사용했음.
import statsmodels.formula.api as smf
mod_r1 = smf.ols("registered ~ temp + I(temp**2) + year" , data= bikes_daily).fit()
mod_r2 = smf.ols("registered ~ (temp + I(temp**2)) * year" , data= bikes_daily).fit()
predictions
temp = np.linspace(bikes_daily["temp" ].min (), bikes_daily["temp" ].max (), 30 )
year = np.array(["2011" , "2012" ])
from itertools import product
grid = pd.DataFrame(
list (product(temp, year)),
columns= ["temp" , "year" ],
)
grid["pred_r1" ] = mod_r1.predict(grid)
grid["pred_r2" ] = mod_r2.predict(grid)
(
so.Plot(grid, x= 'temp' , color= "year" )
.pair(y= ['pred_r1' , 'pred_r2' ], wrap= 1 )
.add(so.Dots())
.layout(size= (7 , 4 ))
)
residuals
bikes_daily_resid = bikes_daily.assign(
resid_r1 = mod_r1.resid,
resid_r2 = mod_r2.resid
)
(
so.Plot(bikes_daily_resid, x= 'temp' )
.pair(y= ['resid_r1' , 'resid_r2' ])
.add(so.Dots(color= '.6' ))
.add(so.Line(), so.PolyFit(5 ))
.add(so.Line(color= ".2" ), so.Agg(lambda x: 0 ))
.facet("year" )
.layout(size= (7 , 6 ))
)
온도와 달의 밀접한 관계?
온도로 예측되지 못한 부분을 달이 추가적으로 예측할 수 있는 여지가 있는가?
mod = smf.ols('temp ~ month' , data= bikes_daily).fit()
mod.rsquared
(
boxplot(bikes_daily, x= 'month' , y= 'temp' )
.facet("year" )
.layout(size= (8 , 4 ))
)
(
so.Plot(bikes_daily_resid, x= 'month' , y= 'resid_r2' )
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
.facet("year" )
.layout(size= (8 , 5 ))
)
9~11월에는 온도로는 예측되지 않는, 즉 같은 온도라고 해도 자건거를 특별히 더 대여하게 되는 특성이 있을 수 있음.
반대로, 2~4월에는 온도로는 예측되지 않는 자건거를 특별히 더 적게 대여하게 되는 특성이 있을 수 있음.
그 이유를 파악할 수 있다면, term을 만들어 추가할 수 있을 것.
(
so.Plot(bikes_daily_resid, x= 'date' , y= 'resid_r2' )
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 )) # grouping by year
.layout(size= (8 , 5 ))
)
bikes_daily["month" ] = bikes_daily["month" ].astype("category" )
bikes_daily["month" ] = bikes_daily["month" ].cat.set_categories(
["Jan" , "Feb" , "Mar" , "Apr" , "May" , "Jun" , "Jul" , "Aug" , "Sep" , "Oct" , "Nov" , "Dec" ]
)
mod_r3 = smf.ols("registered ~ (temp + I(temp**2)) * year + month" , data= bikes_daily).fit()
display(mod_r3.rsquared, mod_r2.rsquared) # month 추가로 인한 R2가 증가분
print (mod_r3.summary(slim= True ))
OLS Regression Results
==============================================================================
Dep. Variable: registered R-squared: 0.705
Model: OLS Adj. R-squared: 0.699
No. Observations: 731 F-statistic: 106.8
Covariance Type: nonrobust Prob (F-statistic): 5.06e-177
=============================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------
Intercept 180.4223 321.376 0.561 0.575 -450.533 811.377
year[T.2012] -1303.6849 477.877 -2.728 0.007 -2241.898 -365.472
month[T.Feb] 16.8031 163.242 0.103 0.918 -303.688 337.295
month[T.Mar] 70.8817 178.488 0.397 0.691 -279.542 421.306
month[T.Apr] 291.0581 198.194 1.469 0.142 -98.054 680.170
month[T.May] 649.9687 223.994 2.902 0.004 210.202 1089.735
month[T.Jun] 1021.8253 248.544 4.111 0.000 533.861 1509.789
month[T.Jul] 853.7900 277.468 3.077 0.002 309.039 1398.541
month[T.Aug] 985.2750 256.593 3.840 0.000 481.508 1489.041
month[T.Sep] 1085.0183 229.970 4.718 0.000 633.521 1536.516
month[T.Oct] 995.3377 200.849 4.956 0.000 601.012 1389.663
month[T.Nov] 946.3073 178.028 5.315 0.000 596.786 1295.828
month[T.Dec] 568.6507 166.184 3.422 0.001 242.382 894.919
temp 6120.3556 1655.969 3.696 0.000 2869.205 9371.506
temp:year[T.2012] 1.24e+04 2087.372 5.938 0.000 8297.289 1.65e+04
I(temp ** 2) -3876.2985 1731.986 -2.238 0.026 -7276.693 -475.904
I(temp ** 2):year[T.2012] -1.105e+04 2082.530 -5.304 0.000 -1.51e+04 -6958.058
=============================================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
bikes_daily_resid["resid_r3" ] = mod_r3.resid
(
so.Plot(bikes_daily_resid, x= 'date' , y= 'resid_r3' )
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
)
# 예측값과 실제값 비교
bikes_daily_resid["pred_r3" ] = mod_r3.predict(bikes_daily)
(
so.Plot(bikes_daily_resid, x= 'registered' , y= 'pred_r3' )
.add(so.Dots())
.add(so.Line(), y= 'registered' )
.facet("year" )
.layout(size= (8 , 4 ))
)
outliers = bikes_daily_resid.query('resid_r3 < -2000' )[["date" , "holiday" ]]
outliers
date holiday
238 2011-08-27 0
365 2012-01-01 0
448 2012-03-24 0
477 2012-04-22 0
499 2012-05-14 0
517 2012-06-01 0
567 2012-07-21 0
596 2012-08-19 0
610 2012-09-02 0
626 2012-09-18 0
645 2012-10-07 0
667 2012-10-29 0
668 2012-10-30 0
691 2012-11-22 1
692 2012-11-23 0
693 2012-11-24 0
723 2012-12-24 0
724 2012-12-25 1
725 2012-12-26 0
모형의 비교와 각 예측변수의 공헌도
mod_r_year = smf.ols("registered ~ year" , data= bikes_daily).fit()
mod_r_temp = smf.ols("registered ~ year + temp + I(temp**2)" , data= bikes_daily).fit()
mod_r_month = smf.ols("registered ~ year + temp + I(temp**2) + month" , data= bikes_daily).fit()
mod_r_wday = smf.ols("registered ~ year + temp + I(temp**2) + month + wday" , data= bikes_daily).fit()
print (
f" year: { mod_r_year. rsquared:.3f} \n " ,
f"year + temperature: { mod_r_temp. rsquared:.3f} \n " ,
f"year + temperature + months: { mod_r_month. rsquared:.3f} \n " ,
f"year + temperature + months + days of the week: { mod_r_wday. rsquared:.3f} " ,
)
year: 0.353
year + temperature: 0.653
year + temperature + months: 0.686
year + temperature + months + days of the week: 0.757
Casual bikers
(
so.Plot(bikes_daily, x= 'temp' , y= 'casual' )
.add(so.Dots())
.add(so.Line(), so.PolyFit(5 ))
.facet("year" )
)
# log scale for y
(
so.Plot(bikes_daily, x= 'temp' , y= 'casual' )
.add(so.Dots())
.add(so.Line(), so.PolyFit(2 ))
.facet("year" )
.scale(y= "log" ) # log scale
)
# 온도를 5개 구간으로 나누어 각 온도구간에서의 분포를 살펴봄
bikes_daily["temp_cat" ] = pd.qcut(bikes_daily["temp" ], 5 )
(
so.Plot(bikes_daily, x= 'casual' )
.add(so.Bars(), so.Hist())
.facet("temp_cat" )
.layout(size= (8 , 4 ))
)
bikes_daily["temp_cat" ] = pd.qcut(bikes_daily["temp" ], 7 )
bikes_daily.assign(casual2 = lambda x: x.casual / 100 ).groupby("temp_cat" )["casual2" ].agg(["mean" , "var" ])
mean var
temp_cat
(0.0581, 0.281] 1.94 2.54
(0.281, 0.355] 4.33 9.42
(0.355, 0.447] 6.74 38.09
(0.447, 0.544] 10.11 46.18
(0.544, 0.635] 11.71 58.61
(0.635, 0.716] 12.67 42.25
(0.716, 0.862] 11.88 29.46
이렇게 count 데이터가 전형적으로 보이는 “평균에 따라 표준편차가 함께 증가”하는 경우, generalized linear model (GLM)의 하나인 포아송(Poisson) 분포나 이를 보정한 다른 분포를 결합한 모형을 이용하는 것이 권장됨.
Y를 transform하는 것보다 여러 이점이 있는 한편, Y가 내재적으로 log나 sqrt의 의미를 띤다면 transform하는 것이 더 적절할 수도 있음.
bikes_daily["lcasual" ] = np.log2(bikes_daily["casual" ] + 1 )
mod_c = smf.ols("lcasual ~ (temp + I(temp**2)) + year" , data= bikes_daily).fit()
bikes_daily_resid["resid_c" ] = mod_c.resid
bikes_daily_resid["lcasual" ] = np.log2(bikes_daily_resid["casual" ] + 1 )
(
so.Plot(bikes_daily_resid, x= 'date' , y= 'resid_c' )
.add(so.Dots())
.add(so.Line(), so.Spline(10 ))
.limit(y= (- 3 , 2 ))
)
# 요일에 따라 차이가 있는가?
from sbcustom import boxplot
(
boxplot(bikes_daily, x= "wday" , y= "casual" )
.facet("year" )
.layout(size= (8 , 4 ))
)
요일의 차이가 추가적으로 설명할 수 있는 부분이 있는가?
(
so.Plot(bikes_daily_resid, x= 'date' , y= 'resid_c' )
.add(so.Dots())
.add(so.Line(), so.Spline(5 ))
.add(so.Line(color= ".6" ), so.Agg(lambda x: 0 ))
.limit(y= (- 3 , 2 ))
.facet("wday" , wrap= 4 )
.layout(size= (9 , 5 ))
)
(
so.Plot(bikes_daily_resid, x= 'date' , y= 'resid_c' )
.add(so.Dots())
.add(so.Line(), so.Spline(5 ))
.add(so.Line(color= ".6" ), so.Agg(lambda x: 0 ))
.limit(y= (- 3 , 2 ))
.facet("workingday" )
.layout(size= (9 , 5 ))
.label(title= "workingday: {} " .format )
)
요일별로 모두 고려할 것인가? 아니면 평일/주말로 나눌 것인가?
# 요일별로
mod_c2 = smf.ols("lcasual ~ (temp + I(temp**2)) + year + wday" , data= bikes_daily).fit()
display(mod_c.rsquared, mod_c2.rsquared)
# 평일/주말
mod_c2_1 = smf.ols("lcasual ~ (temp + I(temp**2)) + year + workingday" , data= bikes_daily).fit()
display(mod_c.rsquared, mod_c2.rsquared, mod_c2_1.rsquared)
# 습도 추가
mod_c3 = smf.ols("lcasual ~ (temp + I(temp**2)) + year + wday + bs(hum, 4)" , data= bikes_daily).fit()
display(mod_c.rsquared, mod_c2.rsquared, mod_c3.rsquared)
mod_c3_robust = sm.RLM.from_formula("lcasual ~ (temp + I(temp**2)) + year + wday + bs(hum, 4)" , data= bikes_daily).fit()
마지막 모형(mod_c2)의 예측값
smf.ols("lcasual ~ (temp + I(temp**2)) + year + wday + bs(hum, 4)", data=bikes_daily)
predictions
temp = np.linspace(bikes_daily["temp" ].min (), bikes_daily["temp" ].max (), 30 )
hum = np.array([bikes_daily["hum" ].mean()])
year = np.array(["2011" , "2012" ])
wday = np.array(["Sun" , "Mon" , "Tue" , "Wed" , "Thu" , "Fri" , "Sat" ])
from itertools import product
grid = pd.DataFrame(
list (product(temp, year, wday, hum)),
columns= ["temp" , "year" , "wday" , "hum" ],
)
grid["log_pred_c" ] = mod_c3.predict(grid)
(
so.Plot(grid, x= 'temp' , y= 'log_pred_c' , color= "year" )
.add(so.Dots())
.facet("wday" , wrap= 4 )
.label(title= "Pred of mod_c2 {} " .format )
.layout(size= (9 , 5 ))
)
Original scale로 예측값
predictions with the original scale
temp = np.linspace(bikes_daily["temp" ].min (), bikes_daily["temp" ].max (), 30 )
hum = np.array([bikes_daily["hum" ].mean()])
year = np.array(["2011" , "2012" ])
wday = np.array(["Sun" , "Mon" , "Tue" , "Wed" , "Thu" , "Fri" , "Sat" ])
from itertools import product
grid = pd.DataFrame(
list (product(temp, year, wday, hum)),
columns= ["temp" , "year" , "wday" , "hum" ],
)
grid["pred_c" ] = 2 ** mod_c3.predict(grid) - 1
(
so.Plot(grid, x= 'temp' , y= 'pred_c' , color= "year" )
.add(so.Dots())
.facet("wday" , wrap= 4 )
.label(title= "Pred of mod_c2 {} " .format )
.layout(size= (9 , 5 ))
)
잔차 확인
bikes_daily_resid["resid_c2" ] = mod_c2.resid
bikes_daily_resid["resid_c3" ] = mod_c3.resid
residual plot for mod_c2; 습도 추가 전
(
so.Plot(bikes_daily_resid, x= 'temp' , y= 'resid_c2' )
.add(so.Dots(color= '.6' ))
.add(so.Line(), so.Spline(4 ))
.add(so.Line(color= ".2" ), so.Agg(lambda x: 0 ))
.facet("year" )
.layout(size= (7 , 5 ))
.limit(y= (- 3 , 2 ))
)
residual plot for mod_c3; 습도까지 포함
(
so.Plot(bikes_daily_resid, x= 'temp' , y= 'resid_c3' )
.add(so.Dots(color= '.6' ))
.add(so.Line(), so.Spline(4 ))
.add(so.Line(color= ".2" ), so.Agg(lambda x: 0 ))
.facet("year" )
.layout(size= (7 , 5 ))
.limit(y= (- 3 , 2 ))
)
residual with the original scale
bikes_daily_resid = bikes_daily.assign(
pred_c3_revert = lambda x: 2 ** mod_c3.predict(x) - 1 ,
resid_c3_revert = lambda x: x["casual" ] - x["pred_c3_revert" ]
)
(
so.Plot(bikes_daily_resid, x= 'day' , y= 'resid_c3_revert' )
.add(so.Dots(color= '.6' ))
.add(so.Line(), so.PolyFit(5 ))
.add(so.Line(color= ".2" ), so.Agg(lambda x: 0 ))
.facet("year" )
.layout(size= (7 , 5 ))
.limit(y= (- 1000 , 1000 ))
)
Poissong regresion residuals
# poisson regression
mod_c3_pois = smf.poisson("casual ~ (temp + I(temp**2)) + year + wday + bs(hum, 5)" , data= bikes_daily).fit()
bikes_daily_resid["resid_c3_pois" ] = mod_c3_pois.resid
bikes_daily_resid["pred_c3_pois" ] = mod_c3_pois.predict(bikes_daily)
(
so.Plot(bikes_daily_resid, x= 'day' , y= 'resid_c3_pois' )
.add(so.Dots(color= '.6' ))
.add(so.Line(), so.PolyFit(5 ))
.add(so.Line(color= ".2" ), so.Agg(lambda x: 0 ))
.facet("year" )
.layout(size= (7 , 5 ))
.limit(y= (- 1000 , 1000 ))
)
Optimization terminated successfully.
Current function value: 49.152525
Iterations 7
# 예측값과 실제값 비교
(
so.Plot(bikes_daily_resid, x= 'casual' , y= 'pred_c3_revert' )
.add(so.Dots())
.add(so.Line(color= ".3" ), y= 'casual' )
.facet("year" )
)
# 예측값과 실제값 비교: Poisson regression
(
so.Plot(bikes_daily_resid, x= 'casual' , y= 'pred_c3_pois' )
.add(so.Dots())
.add(so.Line(color= ".3" ), y= 'casual' )
.facet("year" )
)
모형의 비교와 각 예측변수의 공헌도
mod_c_year = smf.ols("lcasual ~ year" , data= bikes_daily).fit()
mod_c_temp = smf.ols("lcasual ~ year + temp + I(temp**2)" , data= bikes_daily).fit()
mod_c_month = smf.ols(
"lcasual ~ year + temp + I(temp**2) + month" , data= bikes_daily
).fit()
mod_c_hum = smf.ols(
"lcasual ~ year + temp + I(temp**2) + month + bs(hum, 4)" , data= bikes_daily
).fit()
mod_c_wday = smf.ols(
"lcasual ~ year + temp + I(temp**2) + month + bs(hum, 4) + wday" , data= bikes_daily
).fit()
print (
f" year: { mod_c_year. rsquared:.3f} \n " ,
f"year + temperature: { mod_c_temp. rsquared:.3f} \n " ,
f"year + temperature + months: { mod_c_month. rsquared:.3f} \n " ,
f"year + temperature + months + humidity: { mod_c_hum. rsquared:.3f} \n " ,
f"year + temperature + months + days of the week: { mod_c_wday. rsquared:.3f} " ,
)
year: 0.057
year + temperature: 0.554
year + temperature + months: 0.588
year + temperature + months + humidity: 0.660
year + temperature + months + days of the week: 0.824
예측값을 original scale로 다시 되돌려 R squared 계산
R-squared with the original scale
bikes_daily_resid2 = bikes_daily.assign(
pred_c_year = lambda x: 2 ** mod_c_year.fittedvalues - 1 ,
pred_c_temp = lambda x: 2 ** mod_c_temp.fittedvalues - 1 ,
pred_c_month = lambda x: 2 ** mod_c_month.fittedvalues - 1 ,
pred_c_hum = lambda x: 2 ** mod_c_hum.fittedvalues - 1 ,
pred_c_wday = lambda x: 2 ** mod_c_wday.fittedvalues - 1 ,
)
from sklearn.metrics import r2_score
for col in ["pred_c_year" , "pred_c_temp" , "pred_c_month" , "pred_c_hum" , "pred_c_wday" ]:
r2 = r2_score(bikes_daily_resid2["casual" ], bikes_daily_resid2[col])
print (f" { col} : { r2:.2f} " )
pred_c_year: -0.08
pred_c_temp: 0.34
pred_c_month: 0.37
pred_c_hum: 0.42
pred_c_wday: 0.80
포아송 회귀 로 살펴보면,
R-squared가 존재하지 않고, 대신 pseudo R-squared가 있음; 절편만 있는 모형 대비 log-likelihood가 얼마나 늘었는지의 비율로 구현
X의 값에 따라 Y의 분포가 달라지는 경우, R-squared는 의미가 없음
데이터를 train/test로 나누어 평가하는 것이 더 적절함
mod_c2_pois = smf.poisson("casual ~ (temp + I(temp**2)) + year + wday" , data= bikes_daily).fit()
mod_c_pois_year = smf.poisson("casual ~ year" , data= bikes_daily).fit()
mod_c_pois_temp = smf.poisson("casual ~ year + temp + I(temp**2)" , data= bikes_daily).fit()
mod_c_pois_month = smf.poisson(
"casual ~ year + temp + I(temp**2) + month" , data= bikes_daily
).fit()
mod_c_pois_hum = smf.ols(
"lcasual ~ year + temp + I(temp**2) + month + bs(hum, 4)" , data= bikes_daily
).fit()
mod_c_pois_wday = smf.poisson(
"casual ~ year + temp + I(temp**2) + month + bs(hum, 4) + wday" , data= bikes_daily
).fit()
print (
f" year: { mod_c_pois_year. prsquared:.3f} \n " , # prsquared: pseudo R-squared
f"year + temperature: { mod_c_pois_temp. prsquared:.3f} \n " ,
f"year + temperature + months: { mod_c_pois_month. prsquared:.3f} \n " ,
f"year + temperature + months + humidity: { mod_c_pois_hum. rsquared:.3f} \n " ,
f"year + temperature + months + days of the week: { mod_c_pois_wday. prsquared:.3f} " ,
)
Optimization terminated successfully.
Current function value: 60.868155
Iterations 7
Optimization terminated successfully.
Current function value: 245.676778
Iterations 5
Optimization terminated successfully.
Current function value: 140.141516
Iterations 6
Optimization terminated successfully.
Current function value: 130.926793
Iterations 7
Optimization terminated successfully.
Current function value: 43.674208
Iterations 7
year: 0.066
year + temperature: 0.467
year + temperature + months: 0.502
year + temperature + months + humidity: 0.660
year + temperature + months + days of the week: 0.834