pandas

import

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder

read file

train = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv")
test = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv")
stores = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv")
transactions = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/transactions.csv")
holidays = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv")
oil = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/oil.csv")

change to datetime

train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['day_of_week'] = train['date'].dt.dayofweek
train['weekend'] = train['day_of_week'].isin([5, 6]).astype(int) 

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['day_of_week'] = test['date'].dt.dayofweek
test['weekend'] = test['day_of_week'].isin([5, 6]).astype(int) 

sort by date

train = train.sort_values(by='date',ascending=True)

encoder

la = LabelEncoder()
train.family = la.fit_transform(train.family)
test.family = la.fit_transform(test.family)

merge

train_with_stores = pd.merge(train, stores, on="store_nbr", validate="many_to_one")
test_with_stores = pd.merge(test, stores, on="store_nbr", validate="many_to_one")

train_stores_holidays = pd.merge(train_with_stores, holidays, on=["date","city"], how="left")
test_stores_holidays = pd.merge(test_with_stores, holidays, on=["date","city"], how="left")

head

train_with_stores.head(10)

info

train_with_stores.info()

rename column

holidays.rename(columns={'locale_name': 'city', 'type': 'day_type'}, inplace=True)

update on condition

holidays.loc[holidays["transferred"] == True, "day_type"] = "Work Day"
#holidays.loc[holidays["transferred"] == True]
holidays.loc[holidays["day_type"] != "Work Day", "day_type"] = "Holiday"

aggregate

#agg_functions = {'day_type': 'first', 'city': 'first'}
#holidays1 = holidays.groupby(['date']).aggregate(agg_functions).reset_index()

drop

holidays.drop(columns=["description","locale","transferred"],inplace=True)

drop duplicates

holidays = holidays.drop_duplicates()

check dup

#holidays1.equals(holidays2)
holidays[holidays.duplicated(subset=["date","city"],keep=False)]

dtype

holidays['date'].dtype

fillna

train_stores_holidays['day_type'] = train_stores_holidays['day_type'].fillna("Work Day")
test_stores_holidays['day_type'] = test_stores_holidays['day_type'].fillna("Work Day")

line plot

sns.lineplot(x="store_nbr", y="sales", data=train_stores_holidays)

bar plot

p = train_stores_holidays.groupby('day_type')['sales'].mean()
p.plot(kind='bar')
plt.title('Sales by day_type')
plt.xlabel('day_type')
plt.ylabel('Mean Sales')
plt.show()

check nan

train_stores_holidays[train_stores_holidays.isna().any(axis=1)]

上篇fastai notebook - NLP

下篇import