Misc.

Mixed

Author

Sungkyun Cho

Published

April 8, 2025

Load packages
# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options
pd.set_option("mode.copy_on_write", True)  # copy-on-write mode (pandas 2.0)
# two-decimal float display; undo with pd.reset_option("display.float_format")
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", 7)  # max number of rows to display

# NumPy options
np.set_printoptions(
    precision=2,
    suppress=True,  # suppress scientific notation
)

# For high resolution display: ask the inline matplotlib backend for "retina" figure output
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

값을 대체하는 방식

# Load the flights data set from a path relative to this notebook
flights = pd.read_csv('../data/flights.csv')
# np.where: vectorised if-else; the result is stored as a new "season" column on flights
flights["season"] = np.where(flights["month"].isin([6, 7]), "summer", "other month")

# np.where in assign(): same expression, but assign() returns a new DataFrame
# (the result is not bound to a name here)
flights.assign(
    season = lambda x: np.where(x.month.isin([6, 7]), "summer", "other month")
)

# apply with if-else or function: the lambda is called once per value of "month"
flights["month"].apply(lambda x: "summer" if x in [6, 7] else "other month")

# map with dictionary: months missing from the dict become NaN, which fillna() then replaces
flights["month"].map({6: "summer", 7: "summer"}).fillna("other month")

# pd.eval: build the condition from a query-style expression string
flights.assign(
    season = lambda x: np.where(pd.eval('x.month in [6, 7]'), "summer", "other month")
)

# apply with match
def get_season(mth):
    match mth:
        case 6 | 7:
            return "summer"
        case _:
            return "other month"

# Call get_season once per value of "month"; the resulting Series is not assigned to anything
flights["month"].apply(get_season)
0         other month
1         other month
2         other month
             ...     
336773    other month
336774    other month
336775    other month
Name: month, Length: 336776, dtype: object

Vectorised operation의 효율성

# Load the baby-names data, then keep the first 100,000 rows as a smaller frame
names = pd.read_csv("../data/babynames.csv")
df = names.iloc[:100_000]

vectorised operation vs. apply()

값을 대체할 때: np.where

import time

# Elapsed time is measured with time.perf_counter(): a monotonic,
# high-resolution clock meant for timing short durations. time.time() follows
# the wall clock, which can be adjusted and may tick coarsely, so a fast
# vectorised step could measure as 0.0 and break the ratio computed below.

# np.where: one vectorised pass over the whole column
start_time = time.perf_counter()

names["births3"] = np.where(names["births"] < 100, 0, names["births"])

end_time = time.perf_counter()
diff_x = end_time - start_time
print(f"vectorized: {diff_x}")

# apply: the lambda is called once per element
start_time = time.perf_counter()

names["births2"] = names["births"].apply(lambda x: 0 if x < 100 else x)

end_time = time.perf_counter()
diff_y = end_time - start_time
print(f"apply: {diff_y}")

# how many times longer apply() took than the vectorised version
print(f"ratio: {diff_y / diff_x}")
vectorized: 0.01050114631652832
apply: 0.3479013442993164
ratio: 33.12984447723919

vectorised vs. apply()

비율을 구할 때

# Elapsed time is measured with time.perf_counter() (monotonic, high
# resolution) rather than the wall clock time.time(), so a short interval is
# not distorted by clock adjustments or coarse ticks.

# vectorised: divide each row by its (name, sex) group total via transform("sum")
start_time = time.perf_counter()

df["births"] / df.groupby(["name", "sex"])["births"].transform("sum")

end_time = time.perf_counter()
diff_x = end_time - start_time
print(f"vectorized: {diff_x}")

# apply: the lambda is called once per (name, sex) group
start_time = time.perf_counter()

df.groupby(["name", "sex"])["births"].apply(lambda x: x / x.sum())

end_time = time.perf_counter()
diff_y = end_time - start_time
print(f"apply: {diff_y}")

# how many times longer apply() took than the vectorised version
print(f"ratio: {diff_y / diff_x}")
vectorized: 0.033573150634765625
apply: 3.183194875717163
ratio: 94.81370724917623