Model Buidling 2 Exercises

Mixed

Author

Sungkyun Cho

Published

June 10, 2025

Load packages

# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm
import statsmodels.formula.api as smf

# pandas options
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.options.display.float_format = '{:.2f}'.format  # pd.reset_option('display.float_format')
pd.options.display.max_rows = 7  # max number of rows to display

# NumPy options
np.set_printoptions(precision = 2, suppress=True)  # suppress scientific notation

# For high resolution display
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

Bike Sharing Demand

Bike Sharing in Washington D.C. Dataset

bikeshare = pd.read_csv("../data/hour.csv")
bikeshare_daily = pd.read_csv("../data/day.csv")


def clean_data(df):
    df.rename({"dteday": "date", "cnt": "count"}, axis=1, inplace=True)

    df = df.assign(
        date=lambda x: pd.to_datetime(x["date"]),  # datetime type으로 변환
        year=lambda x: x["date"].dt.year.astype(str),  # year 추출
        day=lambda x: x["date"].dt.day_of_year,  # day of the year 추출
        month=lambda x: x["date"].dt.month_name().str[:3],  # month 추출
        wday=lambda x: x["date"].dt.day_name().str[:3],  # 요일 추출
    )

    df["season"] = (
        df["season"]
        .map({1: "winter", 2: "spring", 3: "summer", 4: "fall"})  # season을 문자열로 변환
        .astype("category")  # category type으로 변환
        .cat.set_categories(
            ["winter", "spring", "summer", "fall"], ordered=True
        )  # 순서를 지정
    )
    return df


bikes = clean_data(bikeshare)
bikes_daily = clean_data(bikeshare_daily)

bikes_daily.head(3)

   instant       date  season  yr  mnth  holiday  weekday  workingday  \
0        1 2011-01-01  winter   0     1        0        6           0   
1        2 2011-01-02  winter   0     1        0        0           0   
2        3 2011-01-03  winter   0     1        0        1           1   

   weathersit  temp  atemp  hum  windspeed  casual  registered  count  year  \
0           2  0.34   0.36 0.81       0.16     331         654    985  2011   
1           2  0.36   0.35 0.70       0.25     131         670    801  2011   
2           1  0.20   0.19 0.44       0.25     120        1229   1349  2011   

   day month wday  
0    1   Jan  Sat  
1    2   Jan  Sun  
2    3   Jan  Mon

(
    so.Plot(bikes_daily, x='date')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .layout(size=(8, 6))
)

(
    so.Plot(bikes_daily, x='date')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.Spline(10))
    .layout(size=(8, 6))
)

(
    so.Plot(bikes_daily, x='day', color="year")
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .layout(size=(7, 7))
)

(
    so.Plot(bikes_daily, x='day', color="year")
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.Spline(5))
    .layout(size=(7, 7))
)

(
    so.Plot(bikes_daily, x='day', y='temp', color="year")
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
)

# 요일에 따라 차이가 있는가?
from sbcustom import boxplot
(
    boxplot(bikes_daily, x="wday", y="casual")
    .facet("year")
    .layout(size=(8, 4))
)

(
    boxplot(bikes_daily, x="wday", y="registered")
    .facet("year")
    .layout(size=(8, 4))
)

# 달에 따라 차이가 있는가?
(
    boxplot(bikes_daily, x="month", y="casual")
    .facet("year")
    .layout(size=(8, 4))
)

(
    boxplot(bikes_daily, x="month", y="registered")
    .facet("year")
    .layout(size=(8, 4))
)

# 기온에 따라 차이가 있는가?
(
    so.Plot(bikes_daily, x='temp')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(9, 7))
)

Y의 분포가 daily, hourly에 따라 매우 다름

# daily
(
    so.Plot(bikes_daily)
    .pair(x=['casual', 'registered'])
    .add(so.Bars(), so.Hist())
    .facet(row="year")
    .layout(size=(6, 5))
)

# hourly
(
    so.Plot(bikes)
    .pair(x=['casual', 'registered'])
    .add(so.Bars(), so.Hist())
    .facet(row="year")
    .layout(size=(6, 5))
)

(
    so.Plot(bikes_daily, x='temp')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(9, 7))
)

# 바람의 세기에 따라 차이가 있는가?
(
    so.Plot(bikes_daily, x='windspeed')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(9, 7))
)

# 습도에 따라 차이가 있는가?
(
    so.Plot(bikes_daily, x='hum')
    .pair(y=['casual', 'registered'])
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(9, 7))
    .limit(x=(0.4, 1))
)

# 기온과 습도의 관계?
(
    so.Plot(bikes_daily, x='temp', y='hum')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
)

Registered bikers

(
    so.Plot(bikes_daily, x='temp', y='registered')
    .add(so.Dots(color=".6"))
    .add(so.Line(), so.PolyFit(5))
    .add(so.Line(color="red"), so.PolyFit(2))
    .add(so.Line(color="black"), so.Spline(5))
    .facet("year")
)

year는 예측변수로 의미는 없으나 modeling 방식의 예시를 위해 사용했음.

import statsmodels.formula.api as smf
mod_r1 = smf.ols("registered ~ temp + I(temp**2) + year", data=bikes_daily).fit()
mod_r2 = smf.ols("registered ~ (temp + I(temp**2)) * year", data=bikes_daily).fit()

predictions

temp = np.linspace(bikes_daily["temp"].min(), bikes_daily["temp"].max(), 30)
year = np.array(["2011", "2012"])

from itertools import product
grid = pd.DataFrame(
    list(product(temp, year)),
    columns=["temp", "year"],
)
grid["pred_r1"] = mod_r1.predict(grid)
grid["pred_r2"] = mod_r2.predict(grid)

(
    so.Plot(grid, x='temp', color="year")
    .pair(y=['pred_r1', 'pred_r2'], wrap=1)
    .add(so.Dots())
    .layout(size=(7, 4))
)

residuals

bikes_daily_resid = bikes_daily.assign(
    resid_r1 = mod_r1.resid,
    resid_r2 = mod_r2.resid
)

(
    so.Plot(bikes_daily_resid, x='temp')
    .pair(y=['resid_r1', 'resid_r2'])
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.PolyFit(5))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 6))
)

온도와 달의 밀접한 관계?
온도로 예측되지 못한 부분을 달이 추가적으로 예측할 수 있는 여지가 있는가?

mod = smf.ols('temp ~ month', data=bikes_daily).fit()
mod.rsquared

0.8378460473171662

(
    boxplot(bikes_daily, x='month', y='temp')
    .facet("year")
    .layout(size=(8, 4))
)

(
    so.Plot(bikes_daily_resid, x='month', y='resid_r2')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
    .layout(size=(8, 5))
)

9~11월에는 온도로는 예측되지 않는, 즉 같은 온도라고 해도 자건거를 특별히 더 대여하게 되는 특성이 있을 수 있음.
반대로, 2~4월에는 온도로는 예측되지 않는 자건거를 특별히 더 적게 대여하게 되는 특성이 있을 수 있음.
그 이유를 파악할 수 있다면, term을 만들어 추가할 수 있을 것.

(
    so.Plot(bikes_daily_resid, x='date', y='resid_r2')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5)) # grouping by year
    .layout(size=(8, 5))
)

bikes_daily["month"] = bikes_daily["month"].astype("category")
bikes_daily["month"] = bikes_daily["month"].cat.set_categories(
    ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
)

mod_r3 = smf.ols("registered ~ (temp + I(temp**2)) * year + month", data=bikes_daily).fit()

display(mod_r3.rsquared, mod_r2.rsquared)  # month 추가로 인한 R2가 증가분

0.7053133858977505

0.6681657121983411

print(mod_r3.summary(slim=True))

                            OLS Regression Results                            
==============================================================================
Dep. Variable:             registered   R-squared:                       0.705
Model:                            OLS   Adj. R-squared:                  0.699
No. Observations:                 731   F-statistic:                     106.8
Covariance Type:            nonrobust   Prob (F-statistic):          5.06e-177
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                   180.4223    321.376      0.561      0.575    -450.533     811.377
year[T.2012]              -1303.6849    477.877     -2.728      0.007   -2241.898    -365.472
month[T.Feb]                 16.8031    163.242      0.103      0.918    -303.688     337.295
month[T.Mar]                 70.8817    178.488      0.397      0.691    -279.542     421.306
month[T.Apr]                291.0581    198.194      1.469      0.142     -98.054     680.170
month[T.May]                649.9687    223.994      2.902      0.004     210.202    1089.735
month[T.Jun]               1021.8253    248.544      4.111      0.000     533.861    1509.789
month[T.Jul]                853.7900    277.468      3.077      0.002     309.039    1398.541
month[T.Aug]                985.2750    256.593      3.840      0.000     481.508    1489.041
month[T.Sep]               1085.0183    229.970      4.718      0.000     633.521    1536.516
month[T.Oct]                995.3377    200.849      4.956      0.000     601.012    1389.663
month[T.Nov]                946.3073    178.028      5.315      0.000     596.786    1295.828
month[T.Dec]                568.6507    166.184      3.422      0.001     242.382     894.919
temp                       6120.3556   1655.969      3.696      0.000    2869.205    9371.506
temp:year[T.2012]           1.24e+04   2087.372      5.938      0.000    8297.289    1.65e+04
I(temp ** 2)              -3876.2985   1731.986     -2.238      0.026   -7276.693    -475.904
I(temp ** 2):year[T.2012] -1.105e+04   2082.530     -5.304      0.000   -1.51e+04   -6958.058
=============================================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

bikes_daily_resid["resid_r3"] = mod_r3.resid

(
    so.Plot(bikes_daily_resid, x='date', y='resid_r3')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
)

# 예측값과 실제값 비교
bikes_daily_resid["pred_r3"] = mod_r3.predict(bikes_daily)
(
    so.Plot(bikes_daily_resid, x='registered', y='pred_r3')
    .add(so.Dots())
    .add(so.Line(), y='registered')
    .facet("year")
    .layout(size=(8, 4))
)

outliers = bikes_daily_resid.query('resid_r3 < -2000')[["date", "holiday"]]
outliers

          date  holiday
238 2011-08-27        0
365 2012-01-01        0
448 2012-03-24        0
477 2012-04-22        0
499 2012-05-14        0
517 2012-06-01        0
567 2012-07-21        0
596 2012-08-19        0
610 2012-09-02        0
626 2012-09-18        0
645 2012-10-07        0
667 2012-10-29        0
668 2012-10-30        0
691 2012-11-22        1
692 2012-11-23        0
693 2012-11-24        0
723 2012-12-24        0
724 2012-12-25        1
725 2012-12-26        0

모형의 비교와 각 예측변수의 공헌도

mod_r_year = smf.ols("registered ~ year", data=bikes_daily).fit()
mod_r_temp = smf.ols("registered ~ year + temp + I(temp**2)", data=bikes_daily).fit()
mod_r_month = smf.ols("registered ~ year + temp + I(temp**2) + month", data=bikes_daily).fit()
mod_r_wday = smf.ols("registered ~ year + temp + I(temp**2) + month + wday", data=bikes_daily).fit()

print(
    f" year: {mod_r_year.rsquared:.3f}\n",
    f"year + temperature: {mod_r_temp.rsquared:.3f}\n",
    f"year + temperature + months: {mod_r_month.rsquared:.3f}\n",
    f"year + temperature + months + days of the week: {mod_r_wday.rsquared:.3f}",
)

 year: 0.353
 year + temperature: 0.653
 year + temperature + months: 0.686
 year + temperature + months + days of the week: 0.757

Casual bikers

(
    so.Plot(bikes_daily, x='temp', y='casual')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(5))
    .facet("year")
)

# log scale for y
(
    so.Plot(bikes_daily, x='temp', y='casual')
    .add(so.Dots())
    .add(so.Line(), so.PolyFit(2))
    .facet("year")
    .scale(y="log")  # log scale
)

# 온도를 5개 구간으로 나누어 각 온도구간에서의 분포를 살펴봄
bikes_daily["temp_cat"] = pd.qcut(bikes_daily["temp"], 5)

(
    so.Plot(bikes_daily, x='casual')
    .add(so.Bars(), so.Hist())
    .facet("temp_cat")
    .layout(size=(8, 4))
)

bikes_daily["temp_cat"] = pd.qcut(bikes_daily["temp"], 7)

bikes_daily.assign(casual2 = lambda x: x.casual / 100).groupby("temp_cat")["casual2"].agg(["mean", "var"])

                 mean   var
temp_cat                   
(0.0581, 0.281]  1.94  2.54
(0.281, 0.355]   4.33  9.42
(0.355, 0.447]   6.74 38.09
(0.447, 0.544]  10.11 46.18
(0.544, 0.635]  11.71 58.61
(0.635, 0.716]  12.67 42.25
(0.716, 0.862]  11.88 29.46

이렇게 count 데이터가 전형적으로 보이는 “평균에 따라 표준편차가 함께 증가”하는 경우, generalized linear model (GLM)의 하나인 포아송(Poisson) 분포나 이를 보정한 다른 분포를 결합한 모형을 이용하는 것이 권장됨.
Y를 transform하는 것보다 여러 이점이 있는 한편, Y가 내재적으로 log나 sqrt의 의미를 띤다면 transform하는 것이 더 적절할 수도 있음.

bikes_daily["lcasual"] = np.log2(bikes_daily["casual"] + 1)

mod_c = smf.ols("lcasual ~ (temp + I(temp**2)) + year", data=bikes_daily).fit()

bikes_daily_resid["resid_c"] = mod_c.resid
bikes_daily_resid["lcasual"] = np.log2(bikes_daily_resid["casual"] + 1)

(
    so.Plot(bikes_daily_resid, x='date', y='resid_c')
    .add(so.Dots())
    .add(so.Line(), so.Spline(10))
    .limit(y=(-3, 2))
)

# 요일에 따라 차이가 있는가?
from sbcustom import boxplot
(
    boxplot(bikes_daily, x="wday", y="casual")
    .facet("year")
    .layout(size=(8, 4))
)

요일의 차이가 추가적으로 설명할 수 있는 부분이 있는가?

(
    so.Plot(bikes_daily_resid, x='date', y='resid_c')
    .add(so.Dots())
    .add(so.Line(), so.Spline(5))
    .add(so.Line(color=".6"), so.Agg(lambda x: 0))
    .limit(y=(-3, 2))
    .facet("wday", wrap=4)
    .layout(size=(9, 5))
)

(
    so.Plot(bikes_daily_resid, x='date', y='resid_c')
    .add(so.Dots())
    .add(so.Line(), so.Spline(5))
    .add(so.Line(color=".6"), so.Agg(lambda x: 0))
    .limit(y=(-3, 2))
    .facet("workingday")
    .layout(size=(9, 5))
    .label(title="workingday: {}".format)
)

요일별로 모두 고려할 것인가? 아니면 평일/주말로 나눌 것인가?

# 요일별로
mod_c2 = smf.ols("lcasual ~ (temp + I(temp**2)) + year + wday", data=bikes_daily).fit()
display(mod_c.rsquared, mod_c2.rsquared)

0.5540585294121951

0.7283361326294607

# 평일/주말
mod_c2_1 = smf.ols("lcasual ~ (temp + I(temp**2)) + year + workingday", data=bikes_daily).fit()
display(mod_c.rsquared, mod_c2.rsquared, mod_c2_1.rsquared)

0.5540585294121951

0.7283361326294607

0.7242260371575533

# 습도 추가
mod_c3 = smf.ols("lcasual ~ (temp + I(temp**2)) + year + wday + bs(hum, 4)", data=bikes_daily).fit()
display(mod_c.rsquared, mod_c2.rsquared, mod_c3.rsquared)

0.5540585294121951

0.7283361326294607

0.8008034311536445

mod_c3_robust = sm.RLM.from_formula("lcasual ~ (temp + I(temp**2)) + year + wday + bs(hum, 4)", data=bikes_daily).fit()

마지막 모형(mod_c2)의 예측값
smf.ols("lcasual ~ (temp + I(temp**2)) + year + wday + bs(hum, 4)", data=bikes_daily)

predictions

temp = np.linspace(bikes_daily["temp"].min(), bikes_daily["temp"].max(), 30)
hum = np.array([bikes_daily["hum"].mean()])
year = np.array(["2011", "2012"])
wday = np.array(["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])

from itertools import product
grid = pd.DataFrame(
    list(product(temp, year, wday, hum)),
    columns=["temp", "year", "wday", "hum"],
)
grid["log_pred_c"] = mod_c3.predict(grid)

(
    so.Plot(grid, x='temp', y='log_pred_c', color="year")
    .add(so.Dots())
    .facet("wday", wrap=4)
    .label(title="Pred of mod_c2 {}".format)
    .layout(size=(9, 5))
)

Original scale로 예측값

predictions with the original scale

temp = np.linspace(bikes_daily["temp"].min(), bikes_daily["temp"].max(), 30)
hum = np.array([bikes_daily["hum"].mean()])
year = np.array(["2011", "2012"])
wday = np.array(["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])

from itertools import product
grid = pd.DataFrame(
    list(product(temp, year, wday, hum)),
    columns=["temp", "year", "wday", "hum"],
)
grid["pred_c"] = 2**mod_c3.predict(grid) - 1

(
    so.Plot(grid, x='temp', y='pred_c', color="year")
    .add(so.Dots())
    .facet("wday", wrap=4)
    .label(title="Pred of mod_c2 {}".format)
    .layout(size=(9, 5))
)

잔차 확인

bikes_daily_resid["resid_c2"] = mod_c2.resid
bikes_daily_resid["resid_c3"] = mod_c3.resid

residual plot for mod_c2; 습도 추가 전

(
    so.Plot(bikes_daily_resid, x='temp', y='resid_c2')
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.Spline(4))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 5))
    .limit(y=(-3, 2))
)

residual plot for mod_c3; 습도까지 포함

(
    so.Plot(bikes_daily_resid, x='temp', y='resid_c3')
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.Spline(4))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 5))
    .limit(y=(-3, 2))
)

residual with the original scale

bikes_daily_resid = bikes_daily.assign(
    pred_c3_revert = lambda x: 2**mod_c3.predict(x) - 1,
    resid_c3_revert = lambda x: x["casual"] - x["pred_c3_revert"]
)

(
    so.Plot(bikes_daily_resid, x='day', y='resid_c3_revert')
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.PolyFit(5))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 5))
    .limit(y=(-1000, 1000))
)

Poissong regresion residuals

# poisson regression
mod_c3_pois = smf.poisson("casual ~ (temp + I(temp**2)) + year + wday + bs(hum, 5)", data=bikes_daily).fit()
bikes_daily_resid["resid_c3_pois"] = mod_c3_pois.resid
bikes_daily_resid["pred_c3_pois"] = mod_c3_pois.predict(bikes_daily)
(
    so.Plot(bikes_daily_resid, x='day', y='resid_c3_pois')
    .add(so.Dots(color='.6'))
    .add(so.Line(), so.PolyFit(5))
    .add(so.Line(color=".2"), so.Agg(lambda x: 0))
    .facet("year")
    .layout(size=(7, 5))
    .limit(y=(-1000, 1000))
)

Optimization terminated successfully.
         Current function value: 49.152525
         Iterations 7

# 예측값과 실제값 비교
(
    so.Plot(bikes_daily_resid, x='casual', y='pred_c3_revert')
    .add(so.Dots())
    .add(so.Line(color=".3"), y='casual')
    .facet("year")
)

# 예측값과 실제값 비교: Poisson regression
(
    so.Plot(bikes_daily_resid, x='casual', y='pred_c3_pois')
    .add(so.Dots())
    .add(so.Line(color=".3"), y='casual')
    .facet("year")
)

모형의 비교와 각 예측변수의 공헌도

mod_c_year = smf.ols("lcasual ~ year", data=bikes_daily).fit()
mod_c_temp = smf.ols("lcasual ~ year + temp + I(temp**2)", data=bikes_daily).fit()
mod_c_month = smf.ols(
    "lcasual ~ year + temp + I(temp**2) + month", data=bikes_daily
).fit()
mod_c_hum = smf.ols(
    "lcasual ~ year + temp + I(temp**2) + month + bs(hum, 4)", data=bikes_daily
).fit()
mod_c_wday = smf.ols(
    "lcasual ~ year + temp + I(temp**2) + month + bs(hum, 4) + wday", data=bikes_daily
).fit()

print(
    f" year: {mod_c_year.rsquared:.3f}\n",
    f"year + temperature: {mod_c_temp.rsquared:.3f}\n",
    f"year + temperature + months: {mod_c_month.rsquared:.3f}\n",
    f"year + temperature + months + humidity: {mod_c_hum.rsquared:.3f}\n",
    f"year + temperature + months + days of the week: {mod_c_wday.rsquared:.3f}",
)

 year: 0.057
 year + temperature: 0.554
 year + temperature + months: 0.588
 year + temperature + months + humidity: 0.660
 year + temperature + months + days of the week: 0.824

예측값을 original scale로 다시 되돌려 R squared 계산

R-squared with the original scale

bikes_daily_resid2 = bikes_daily.assign(
    pred_c_year = lambda x: 2**mod_c_year.fittedvalues - 1,
    pred_c_temp = lambda x: 2**mod_c_temp.fittedvalues - 1,
    pred_c_month = lambda x: 2**mod_c_month.fittedvalues - 1,
    pred_c_hum = lambda x: 2**mod_c_hum.fittedvalues - 1,
    pred_c_wday = lambda x: 2**mod_c_wday.fittedvalues - 1,
)

from sklearn.metrics import r2_score

for col in ["pred_c_year", "pred_c_temp", "pred_c_month", "pred_c_hum",  "pred_c_wday"]:
    r2 = r2_score(bikes_daily_resid2["casual"], bikes_daily_resid2[col])
    print(f"{col}: {r2:.2f}")

pred_c_year: -0.08
pred_c_temp: 0.34
pred_c_month: 0.37
pred_c_hum: 0.42
pred_c_wday: 0.80

포아송 회귀로 살펴보면,
R-squared가 존재하지 않고, 대신 pseudo R-squared가 있음; 절편만 있는 모형 대비 log-likelihood가 얼마나 늘었는지의 비율로 구현

X의 값에 따라 Y의 분포가 달라지는 경우, R-squared는 의미가 없음
데이터를 train/test로 나누어 평가하는 것이 더 적절함

mod_c2_pois = smf.poisson("casual ~ (temp + I(temp**2)) + year + wday", data=bikes_daily).fit()

mod_c_pois_year = smf.poisson("casual ~ year", data=bikes_daily).fit()
mod_c_pois_temp = smf.poisson("casual ~ year + temp + I(temp**2)", data=bikes_daily).fit()
mod_c_pois_month = smf.poisson(
    "casual ~ year + temp + I(temp**2) + month", data=bikes_daily
).fit()
mod_c_pois_hum = smf.ols(
    "lcasual ~ year + temp + I(temp**2) + month + bs(hum, 4)", data=bikes_daily
).fit()
mod_c_pois_wday = smf.poisson(
    "casual ~ year + temp + I(temp**2) + month + bs(hum, 4) + wday", data=bikes_daily
).fit()

print(
    f" year: {mod_c_pois_year.prsquared:.3f}\n",  # prsquared: pseudo R-squared
    f"year + temperature: {mod_c_pois_temp.prsquared:.3f}\n",
    f"year + temperature + months: {mod_c_pois_month.prsquared:.3f}\n",
    f"year + temperature + months + humidity: {mod_c_pois_hum.rsquared:.3f}\n",
    f"year + temperature + months + days of the week: {mod_c_pois_wday.prsquared:.3f}",
)

Optimization terminated successfully.
         Current function value: 60.868155
         Iterations 7
Optimization terminated successfully.
         Current function value: 245.676778
         Iterations 5
Optimization terminated successfully.
         Current function value: 140.141516
         Iterations 6
Optimization terminated successfully.
         Current function value: 130.926793
         Iterations 7
Optimization terminated successfully.
         Current function value: 43.674208
         Iterations 7
 year: 0.066
 year + temperature: 0.467
 year + temperature + months: 0.502
 year + temperature + months + humidity: 0.660
 year + temperature + months + days of the week: 0.834