Misc.

Mixed

Author

Sungkyun Cho

Published

April 22, 2024

Load packages
# numerical calculation & data frames
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so

# statistics
import statsmodels.api as sm

# pandas options (all routed through pd.set_option)
pd.set_option('mode.copy_on_write', True)  # pandas 2.0
pd.set_option('display.float_format', '{:.2f}'.format)  # undo: pd.reset_option('display.float_format')
pd.set_option('display.max_rows', 7)  # max number of rows to display

# NumPy options
np.set_printoptions(suppress=True, precision=2)  # two decimals, no scientific notation

# For high resolution display:
# set the inline backend's figure format to "retina".
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

값을 대체하는 방식

# Load the flights data from a local CSV file.
flights = pd.read_csv('data/flights.csv')
# np.where: label months 6 and 7 as "summer" and every other month as
# "other month", storing the result in the "season" column of `flights`.
flights["season"] = np.where(flights["month"].isin([6, 7]), "summer", "other month")

# np.where in assign(): same labelling, computed through a lambda inside
# assign(); the returned frame is not bound back to `flights`.
flights.assign(
    season = lambda x: np.where(x.month.isin([6, 7]), "summer", "other month")
)

# apply with if-else or function: element-wise conditional expression
flights["month"].apply(lambda x: "summer" if x in [6, 7] else "other month")

# map with dictionary: only 6 and 7 are keys, so fillna() supplies the label
# for the months that map() leaves missing.
flights["month"].map({6: "summer", 7: "summer"}).fillna("other month")

# pd.eval: express the condition as a query-style expression string
# (the string refers to the lambda parameter `x` by name).
flights.assign(
    season = lambda x: np.where(pd.eval('x.month in [6, 7]'), "summer", "other month")
)

# apply with match
def get_season(mth):
    match mth:
        case 6 | 7:
            return "summer"
        case _:
            return "other month"

# Label every value of the "month" column by calling get_season on it.
flights["month"].apply(get_season)

Vectorised operation의 효율성

# Full baby-names table, read from a local CSV file.
names = pd.read_csv("data/babynames.csv")
# Working subset: the first 100,000 rows, selected by position.
df = names.iloc[:100_000]

vectorised operation vs. apply()

값을 대체할 때: np.where

import time

# Time the same replacement two ways: births below 100 become 0, all other
# values are kept.
# time.perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations. With a coarse
# wall clock the fast vectorised step can measure as 0.0 seconds, which would
# make the ratio below raise ZeroDivisionError.

# np.where
start_time = time.perf_counter()

names["births3"] = np.where(names["births"] < 100, 0, names["births"])

end_time = time.perf_counter()
diff_x = end_time - start_time
print(f"vectorized: {diff_x}")

# apply
start_time = time.perf_counter()

names["births2"] = names["births"].apply(lambda x: 0 if x < 100 else x)

end_time = time.perf_counter()
diff_y = end_time - start_time
print(f"apply: {diff_y}")

# How many times slower the element-wise apply() was than np.where.
print(f"ratio: {diff_y / diff_x}")
vectorized: 0.008668899536132812
apply: 0.34128403663635254
ratio: 39.368784378437844

vectorised vs. apply()

비율을 구할 때

# Time two ways of dividing each row's births by the total births of its
# (name, sex) group.
# time.perf_counter() is used rather than time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations, so the fast
# vectorised step cannot measure as 0.0 and break the ratio below.

# vectorised: broadcast the group totals with transform("sum"), then divide
start_time = time.perf_counter()

df["births"] / df.groupby(["name", "sex"])["births"].transform("sum")

end_time = time.perf_counter()
diff_x = end_time - start_time
print(f"vectorized: {diff_x}")

# apply: run a Python lambda once per group
start_time = time.perf_counter()

df.groupby(["name", "sex"])["births"].apply(lambda x: x / x.sum())

end_time = time.perf_counter()
diff_y = end_time - start_time
print(f"apply: {diff_y}")

# How many times slower the per-group apply() was than the vectorised form.
print(f"ratio: {diff_y / diff_x}")
vectorized: 0.058942317962646484
apply: 3.583096742630005
ratio: 60.7898852043912