Model Buidling 2 Exercises

Mixed

Author

Sungkyun Cho

Published

June 10, 2025

Load packages
# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm
import statsmodels.formula.api as smf

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format  # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision = 2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

Bike Sharing Demand

Bike Sharing in Washington D.C. Dataset

bikeshare = pd.read_csv("../data/hour.csv")
bikeshare_daily = pd.read_csv("../data/day.csv")


def clean_data(df):
    df.rename({"dteday": "date", "cnt": "count"}, axis=1, inplace=True)

    df = df.assign(
        date=lambda x: pd.to_datetime(x["date"]),  # datetime type으로 변환
        year=lambda x: x["date"].dt.year.astype(str),  # year 추출
        day=lambda x: x["date"].dt.day_of_year,  # day of the year 추출
        month=lambda x: x["date"].dt.month_name().str[:3],  # month 추출
        wday=lambda x: x["date"].dt.day_name().str[:3],  # 요일 추출
    )

    df["season"] = (
        df["season"]
        .map({1: "winter", 2: "spring", 3: "summer", 4: "fall"})  # season을 문자열로 변환
        .astype("category")  # category type으로 변환
        .cat.set_categories(
            ["winter", "spring", "summer", "fall"], ordered=True
        )  # 순서를 지정
    )
    return df


bikes = clean_data(bikeshare)
bikes_daily = clean_data(bikeshare_daily)
bikes_daily.head(3)
   instant       date  season  yr  mnth  holiday  weekday  workingday  \
0        1 2011-01-01  winter   0     1        0        6           0   
1        2 2011-01-02  winter   0     1        0        0           0   
2        3 2011-01-03  winter   0     1        0        1           1   

   weathersit  temp  atemp  hum  windspeed  casual  registered  count  year  \
0           2  0.34   0.36 0.81       0.16     331         654    985  2011   
1           2  0.36   0.35 0.70       0.25     131         670    801  2011   
2           1  0.20   0.19 0.44       0.25     120        1229   1349  2011   

   day month wday  
0    1   Jan  Sat  
1    2   Jan  Sun  
2    3   Jan  Mon  
(
    so.Plot(bikes_daily, x='date')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .layout(size=(8, 6))
)

(
    so.Plot(bikes_daily, x='date')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.Spline(10))
    .layout(size=(8, 6))
)

(
    so.Plot(bikes_daily, x='day', color="year")
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .layout(size=(7, 7))
)

(
    so.Plot(bikes_daily, x='day', color="year")
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.Spline(5))
    .layout(size=(7, 7))
)

(
    so.Plot(bikes_daily, x='day', y='temp', color="year")
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
)

# 요일에 따라 차이가 있는가?
from sbcustom import boxplot
(
    boxplot(bikes_daily, x="wday", y="casual")
    .facet("year")
    .layout(size=(8, 4))
)

(
    boxplot(bikes_daily, x="wday", y="registered")
    .facet("year")
    .layout(size=(8, 4))
)

# 달에 따라 차이가 있는가?
(
    boxplot(bikes_daily, x="month", y="casual")
    .facet("year")
    .layout(size=(8, 4))
)

(
    boxplot(bikes_daily, x="month", y="registered")
    .facet("year")
    .layout(size=(8, 4))
)

# 기온에 따라 차이가 있는가?
(
    so.Plot(bikes_daily, x='temp')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(9, 7))
)

Y의 분포가 daily, hourly에 따라 매우 다름

# daily
(
    so.Plot(bikes_daily)
    .pair(x=['casual', 'registered'])
    .add(so.Bars(), so.Hist())
    .facet(row="year")
    .layout(size=(6, 5))
)

# hourly
(
    so.Plot(bikes)
    .pair(x=['casual', 'registered'])
    .add(so.Bars(), so.Hist())
    .facet(row="year")
    .layout(size=(6, 5))
)

(
    so.Plot(bikes_daily, x='temp')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(9, 7))
)

# 바람의 세기에 따라 차이가 있는가?
(
    so.Plot(bikes_daily, x='windspeed')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(9, 7))
)

# 습도에 따라 차이가 있는가?
(
    so.Plot(bikes_daily, x='hum')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(9, 7))
    .limit(x=(0.4, 1))
)

# 기온과 습도의 관계?
(
    so.Plot(bikes_daily, x='temp', y='hum')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
)

Registered bikers

(
    so.Plot(bikes_daily, x='temp', y='registered')
    .add(so.Dots(color=".6"))
    .add(so.Line(), so.PolyFit(5))
    .add(so.Line(color="red"), so.PolyFit(2))
    .add(so.Line(color="black"), so.Spline(5))
    .facet("year")
)

year는 예측변수로 의미는 없으나 modeling 방식의 예시를 위해 사용했음.

import statsmodels.formula.api as smf
mod_r1 = smf.ols("registered ~ temp + I(temp**2) + year", data=bikes_daily).fit()
mod_r2 = smf.ols("registered ~ (temp + I(temp**2)) * year", data=bikes_daily).fit()
predictions
temp = np.linspace(bikes_daily["temp"].min(), bikes_daily["temp"].max(), 30)
year = np.array(["2011", "2012"])

from itertools import product
grid = pd.DataFrame(
    list(product(temp, year)),
    columns=["temp", "year"],
)
grid["pred_r1"] = mod_r1.predict(grid)
grid["pred_r2"] = mod_r2.predict(grid)

(
    so.Plot(grid, x='temp', color="year")
    .pair(y=['pred_r1', 'pred_r2'], wrap=1)
    .add(so.Dots())
    .layout(size=(7, 4))
)

residuals
bikes_daily_resid = bikes_daily.assign(
    resid_r1 = mod_r1.resid,
    resid_r2 = mod_r2.resid
)

(
    so.Plot(bikes_daily_resid, x='temp')
    .pair(y=['resid_r1', 'resid_r2'])
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.PolyFit(5))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 6))
)

온도와 달의 밀접한 관계?
온도로 예측되지 못한 부분을 달이 추가적으로 예측할 수 있는 여지가 있는가?

mod = smf.ols('temp ~ month', data=bikes_daily).fit()
mod.rsquared
0.8378460473171662
(
    boxplot(bikes_daily, x='month', y='temp')
    .facet("year")
    .layout(size=(8, 4))
)

(
    so.Plot(bikes_daily_resid, x='month', y='resid_r2')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(8, 5))
)

9~11월에는 온도로는 예측되지 않는, 즉 같은 온도라고 해도 자건거를 특별히 더 대여하게 되는 특성이 있을 수 있음.
반대로, 2~4월에는 온도로는 예측되지 않는 자건거를 특별히 더 적게 대여하게 되는 특성이 있을 수 있음.
그 이유를 파악할 수 있다면, term을 만들어 추가할 수 있을 것.

(
    so.Plot(bikes_daily_resid, x='date', y='resid_r2')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5)) # grouping by year
    .layout(size=(8, 5))
)

bikes_daily["month"] = bikes_daily["month"].astype("category")
bikes_daily["month"] = bikes_daily["month"].cat.set_categories(
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
)
mod_r3 = smf.ols("registered ~ (temp + I(temp**2)) * year + month", data=bikes_daily).fit()
display(mod_r3.rsquared, mod_r2.rsquared)  # month 추가로 인한 R2가 증가분
0.7053133858977505
0.6681657121983411
print(mod_r3.summary(slim=True))
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             registered   R-squared:                       0.705
Model:                            OLS   Adj. R-squared:                  0.699
No. Observations:                 731   F-statistic:                     106.8
Covariance Type:            nonrobust   Prob (F-statistic):          5.06e-177
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                   180.4223    321.376      0.561      0.575    -450.533     811.377
year[T.2012]              -1303.6849    477.877     -2.728      0.007   -2241.898    -365.472
month[T.Feb]                 16.8031    163.242      0.103      0.918    -303.688     337.295
month[T.Mar]                 70.8817    178.488      0.397      0.691    -279.542     421.306
month[T.Apr]                291.0581    198.194      1.469      0.142     -98.054     680.170
month[T.May]                649.9687    223.994      2.902      0.004     210.202    1089.735
month[T.Jun]               1021.8253    248.544      4.111      0.000     533.861    1509.789
month[T.Jul]                853.7900    277.468      3.077      0.002     309.039    1398.541
month[T.Aug]                985.2750    256.593      3.840      0.000     481.508    1489.041
month[T.Sep]               1085.0183    229.970      4.718      0.000     633.521    1536.516
month[T.Oct]                995.3377    200.849      4.956      0.000     601.012    1389.663
month[T.Nov]                946.3073    178.028      5.315      0.000     596.786    1295.828
month[T.Dec]                568.6507    166.184      3.422      0.001     242.382     894.919
temp                       6120.3556   1655.969      3.696      0.000    2869.205    9371.506
temp:year[T.2012]           1.24e+04   2087.372      5.938      0.000    8297.289    1.65e+04
I(temp ** 2)              -3876.2985   1731.986     -2.238      0.026   -7276.693    -475.904
I(temp ** 2):year[T.2012] -1.105e+04   2082.530     -5.304      0.000   -1.51e+04   -6958.058
=============================================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
bikes_daily_resid["resid_r3"] = mod_r3.resid
(
    so.Plot(bikes_daily_resid, x='date', y='resid_r3')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
)

# 예측값과 실제값 비교
bikes_daily_resid["pred_r3"] = mod_r3.predict(bikes_daily)
(
    so.Plot(bikes_daily_resid, x='registered', y='pred_r3')
    .add(so.Dots())
    .add(so.Line(), y='registered')
    .facet("year")
    .layout(size=(8, 4))
)

outliers = bikes_daily_resid.query('resid_r3 < -2000')[["date", "holiday"]]
outliers
          date  holiday
238 2011-08-27        0
365 2012-01-01        0
448 2012-03-24        0
477 2012-04-22        0
499 2012-05-14        0
517 2012-06-01        0
567 2012-07-21        0
596 2012-08-19        0
610 2012-09-02        0
626 2012-09-18        0
645 2012-10-07        0
667 2012-10-29        0
668 2012-10-30        0
691 2012-11-22        1
692 2012-11-23        0
693 2012-11-24        0
723 2012-12-24        0
724 2012-12-25        1
725 2012-12-26        0

모형의 비교와 각 예측변수의 공헌도

mod_r_year = smf.ols("registered ~ year", data=bikes_daily).fit()
mod_r_temp = smf.ols("registered ~ year + temp + I(temp**2)", data=bikes_daily).fit()
mod_r_month = smf.ols("registered ~ year + temp + I(temp**2) + month", data=bikes_daily).fit()
mod_r_wday = smf.ols("registered ~ year + temp + I(temp**2) + month + wday", data=bikes_daily).fit()

print(
    f" year: {mod_r_year.rsquared:.3f}\n",
    f"year + temperature: {mod_r_temp.rsquared:.3f}\n",
    f"year + temperature + months: {mod_r_month.rsquared:.3f}\n",
    f"year + temperature + months + days of the week: {mod_r_wday.rsquared:.3f}",
)
 year: 0.353
 year + temperature: 0.653
 year + temperature + months: 0.686
 year + temperature + months + days of the week: 0.757

Casual bikers

(
    so.Plot(bikes_daily, x='temp', y='casual')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
)

# log scale for y
(
    so.Plot(bikes_daily, x='temp', y='casual')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(2))
    .facet("year")
    .scale(y="log")  # log scale
)

# 온도를 5개 구간으로 나누어 각 온도구간에서의 분포를 살펴봄
bikes_daily["temp_cat"] = pd.qcut(bikes_daily["temp"], 5)

(
    so.Plot(bikes_daily, x='casual')
    .add(so.Bars(), so.Hist())
    .facet("temp_cat")
    .layout(size=(8, 4))
)

bikes_daily["temp_cat"] = pd.qcut(bikes_daily["temp"], 7)

bikes_daily.assign(casual2 = lambda x: x.casual / 100).groupby("temp_cat")["casual2"].agg(["mean", "var"])
                 mean   var
temp_cat                   
(0.0581, 0.281]  1.94  2.54
(0.281, 0.355]   4.33  9.42
(0.355, 0.447]   6.74 38.09
(0.447, 0.544]  10.11 46.18
(0.544, 0.635]  11.71 58.61
(0.635, 0.716]  12.67 42.25
(0.716, 0.862]  11.88 29.46
  • 이렇게 count 데이터가 전형적으로 보이는 “평균에 따라 표준편차가 함께 증가”하는 경우, generalized linear model (GLM)의 하나인 포아송(Poisson) 분포나 이를 보정한 다른 분포를 결합한 모형을 이용하는 것이 권장됨.

  • Y를 transform하는 것보다 여러 이점이 있는 한편, Y가 내재적으로 log나 sqrt의 의미를 띤다면 transform하는 것이 더 적절할 수도 있음.

bikes_daily["lcasual"] = np.log2(bikes_daily["casual"] + 1)
mod_c = smf.ols("lcasual ~ (temp + I(temp**2)) + year", data=bikes_daily).fit()
bikes_daily_resid["resid_c"] = mod_c.resid
bikes_daily_resid["lcasual"] = np.log2(bikes_daily_resid["casual"] + 1)
(
    so.Plot(bikes_daily_resid, x='date', y='resid_c')
    .add(so.Dots())
    .add(so.Line(), so.Spline(10))
    .limit(y=(-3, 2))
)

# 요일에 따라 차이가 있는가?
from sbcustom import boxplot
(
    boxplot(bikes_daily, x="wday", y="casual")
    .facet("year")
    .layout(size=(8, 4))
)

요일의 차이가 추가적으로 설명할 수 있는 부분이 있는가?

(
    so.Plot(bikes_daily_resid, x='date', y='resid_c')
    .add(so.Dots())
    .add(so.Line(), so.Spline(5))
    .add(so.Line(color=".6"), so.Agg(lambda x: 0))
    .limit(y=(-3, 2))
    .facet("wday", wrap=4)
    .layout(size=(9, 5))
)

(
    so.Plot(bikes_daily_resid, x='date', y='resid_c')
    .add(so.Dots())
    .add(so.Line(), so.Spline(5))
    .add(so.Line(color=".6"), so.Agg(lambda x: 0))
    .limit(y=(-3, 2))
    .facet("workingday")
    .layout(size=(9, 5))
    .label(title="workingday: {}".format)
)

요일별로 모두 고려할 것인가? 아니면 평일/주말로 나눌 것인가?

# 요일별로
mod_c2 = smf.ols("lcasual ~ (temp + I(temp**2)) + year + wday", data=bikes_daily).fit()
display(mod_c.rsquared, mod_c2.rsquared)
0.5540585294121951
0.7283361326294607
# 평일/주말
mod_c2_1 = smf.ols("lcasual ~ (temp + I(temp**2)) + year + workingday", data=bikes_daily).fit()
display(mod_c.rsquared, mod_c2.rsquared, mod_c2_1.rsquared)
0.5540585294121951
0.7283361326294607
0.7242260371575533
# 습도 추가
mod_c3 = smf.ols("lcasual ~ (temp + I(temp**2)) + year + wday + bs(hum, 4)", data=bikes_daily).fit()
display(mod_c.rsquared, mod_c2.rsquared, mod_c3.rsquared)
0.5540585294121951
0.7283361326294607
0.8008034311536445
mod_c3_robust = sm.RLM.from_formula("lcasual ~ (temp + I(temp**2)) + year + wday + bs(hum, 4)", data=bikes_daily).fit()

마지막 모형(mod_c2)의 예측값
smf.ols("lcasual ~ (temp + I(temp**2)) + year + wday + bs(hum, 4)", data=bikes_daily)

predictions
temp = np.linspace(bikes_daily["temp"].min(), bikes_daily["temp"].max(), 30)
hum = np.array([bikes_daily["hum"].mean()])
year = np.array(["2011", "2012"])
wday = np.array(["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])

from itertools import product
grid = pd.DataFrame(
    list(product(temp, year, wday, hum)),
    columns=["temp", "year", "wday", "hum"],
)
grid["log_pred_c"] = mod_c3.predict(grid)

(
    so.Plot(grid, x='temp', y='log_pred_c', color="year")
    .add(so.Dots())
    .facet("wday", wrap=4)
    .label(title="Pred of mod_c2 {}".format)
    .layout(size=(9, 5))
)

Original scale로 예측값

predictions with the original scale
temp = np.linspace(bikes_daily["temp"].min(), bikes_daily["temp"].max(), 30)
hum = np.array([bikes_daily["hum"].mean()])
year = np.array(["2011", "2012"])
wday = np.array(["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])

from itertools import product
grid = pd.DataFrame(
    list(product(temp, year, wday, hum)),
    columns=["temp", "year", "wday", "hum"],
)
grid["pred_c"] = 2**mod_c3.predict(grid) - 1

(
    so.Plot(grid, x='temp', y='pred_c', color="year")
    .add(so.Dots())
    .facet("wday", wrap=4)
    .label(title="Pred of mod_c2 {}".format)
    .layout(size=(9, 5))
)

잔차 확인

bikes_daily_resid["resid_c2"] = mod_c2.resid
bikes_daily_resid["resid_c3"] = mod_c3.resid
residual plot for mod_c2; 습도 추가 전
(
    so.Plot(bikes_daily_resid, x='temp', y='resid_c2')
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.Spline(4))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 5))
    .limit(y=(-3, 2))
)

residual plot for mod_c3; 습도까지 포함
(
    so.Plot(bikes_daily_resid, x='temp', y='resid_c3')
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.Spline(4))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 5))
    .limit(y=(-3, 2))
)

residual with the original scale
bikes_daily_resid = bikes_daily.assign(
    pred_c3_revert = lambda x: 2**mod_c3.predict(x) - 1,
    resid_c3_revert = lambda x: x["casual"] - x["pred_c3_revert"]
)

(
    so.Plot(bikes_daily_resid, x='day', y='resid_c3_revert')
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.PolyFit(5))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 5))
    .limit(y=(-1000, 1000))
)

Poissong regresion residuals
# poisson regression
mod_c3_pois = smf.poisson("casual ~ (temp + I(temp**2)) + year + wday + bs(hum, 5)", data=bikes_daily).fit()
bikes_daily_resid["resid_c3_pois"] = mod_c3_pois.resid
bikes_daily_resid["pred_c3_pois"] = mod_c3_pois.predict(bikes_daily)
(
    so.Plot(bikes_daily_resid, x='day', y='resid_c3_pois')
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.PolyFit(5))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 5))
    .limit(y=(-1000, 1000))
)
Optimization terminated successfully.
         Current function value: 49.152525
         Iterations 7

# 예측값과 실제값 비교
(
    so.Plot(bikes_daily_resid, x='casual', y='pred_c3_revert')
    .add(so.Dots())
    .add(so.Line(color=".3"), y='casual')
    .facet("year")
)

# 예측값과 실제값 비교: Poisson regression
(
    so.Plot(bikes_daily_resid, x='casual', y='pred_c3_pois')
    .add(so.Dots())
    .add(so.Line(color=".3"), y='casual')
    .facet("year")
)

모형의 비교와 각 예측변수의 공헌도

mod_c_year = smf.ols("lcasual ~ year", data=bikes_daily).fit()
mod_c_temp = smf.ols("lcasual ~ year + temp + I(temp**2)", data=bikes_daily).fit()
mod_c_month = smf.ols(
    "lcasual ~ year + temp + I(temp**2) + month", data=bikes_daily
).fit()
mod_c_hum = smf.ols(
    "lcasual ~ year + temp + I(temp**2) + month + bs(hum, 4)", data=bikes_daily
).fit()
mod_c_wday = smf.ols(
    "lcasual ~ year + temp + I(temp**2) + month + bs(hum, 4) + wday", data=bikes_daily
).fit()

print(
    f" year: {mod_c_year.rsquared:.3f}\n",
    f"year + temperature: {mod_c_temp.rsquared:.3f}\n",
    f"year + temperature + months: {mod_c_month.rsquared:.3f}\n",
    f"year + temperature + months + humidity: {mod_c_hum.rsquared:.3f}\n",
    f"year + temperature + months + days of the week: {mod_c_wday.rsquared:.3f}",
)
 year: 0.057
 year + temperature: 0.554
 year + temperature + months: 0.588
 year + temperature + months + humidity: 0.660
 year + temperature + months + days of the week: 0.824

예측값을 original scale로 다시 되돌려 R squared 계산

R-squared with the original scale
bikes_daily_resid2 = bikes_daily.assign(
    pred_c_year = lambda x: 2**mod_c_year.fittedvalues - 1,
    pred_c_temp = lambda x: 2**mod_c_temp.fittedvalues - 1,
    pred_c_month = lambda x: 2**mod_c_month.fittedvalues - 1,
    pred_c_hum = lambda x: 2**mod_c_hum.fittedvalues - 1,
    pred_c_wday = lambda x: 2**mod_c_wday.fittedvalues - 1,
)

from sklearn.metrics import r2_score

for col in ["pred_c_year", "pred_c_temp", "pred_c_month", "pred_c_hum",  "pred_c_wday"]:
    r2 = r2_score(bikes_daily_resid2["casual"], bikes_daily_resid2[col])
    print(f"{col}: {r2:.2f}")
pred_c_year: -0.08
pred_c_temp: 0.34
pred_c_month: 0.37
pred_c_hum: 0.42
pred_c_wday: 0.80

포아송 회귀로 살펴보면,
R-squared가 존재하지 않고, 대신 pseudo R-squared가 있음; 절편만 있는 모형 대비 log-likelihood가 얼마나 늘었는지의 비율로 구현

  • X의 값에 따라 Y의 분포가 달라지는 경우, R-squared는 의미가 없음
  • 데이터를 train/test로 나누어 평가하는 것이 더 적절함

mod_c2_pois = smf.poisson("casual ~ (temp + I(temp**2)) + year + wday", data=bikes_daily).fit()

mod_c_pois_year = smf.poisson("casual ~ year", data=bikes_daily).fit()
mod_c_pois_temp = smf.poisson("casual ~ year + temp + I(temp**2)", data=bikes_daily).fit()
mod_c_pois_month = smf.poisson(
    "casual ~ year + temp + I(temp**2) + month", data=bikes_daily
).fit()
mod_c_pois_hum = smf.ols(
    "lcasual ~ year + temp + I(temp**2) + month + bs(hum, 4)", data=bikes_daily
).fit()
mod_c_pois_wday = smf.poisson(
    "casual ~ year + temp + I(temp**2) + month + bs(hum, 4) + wday", data=bikes_daily
).fit()

print(
    f" year: {mod_c_pois_year.prsquared:.3f}\n",  # prsquared: pseudo R-squared
    f"year + temperature: {mod_c_pois_temp.prsquared:.3f}\n",
    f"year + temperature + months: {mod_c_pois_month.prsquared:.3f}\n",
    f"year + temperature + months + humidity: {mod_c_pois_hum.rsquared:.3f}\n",
    f"year + temperature + months + days of the week: {mod_c_pois_wday.prsquared:.3f}",
)
Optimization terminated successfully.
         Current function value: 60.868155
         Iterations 7
Optimization terminated successfully.
         Current function value: 245.676778
         Iterations 5
Optimization terminated successfully.
         Current function value: 140.141516
         Iterations 6
Optimization terminated successfully.
         Current function value: 130.926793
         Iterations 7
Optimization terminated successfully.
         Current function value: 43.674208
         Iterations 7
 year: 0.066
 year + temperature: 0.467
 year + temperature + months: 0.502
 year + temperature + months + humidity: 0.660
 year + temperature + months + days of the week: 0.834