import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
train = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv")
test = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv")
stores = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv")
transactions = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/transactions.csv")
holidays = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv")
oil = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/oil.csv")
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['day_of_week'] = train['date'].dt.dayofweek
train['weekend'] = train['day_of_week'].isin([5, 6]).astype(int)
test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['day_of_week'] = test['date'].dt.dayofweek
test['weekend'] = test['day_of_week'].isin([5, 6]).astype(int)
train = train.sort_values(by='date',ascending=True)
la = LabelEncoder()
train.family = la.fit_transform(train.family)
test.family = la.fit_transform(test.family)
train_with_stores = pd.merge(train, stores, on="store_nbr", validate="many_to_one")
test_with_stores = pd.merge(test, stores, on="store_nbr", validate="many_to_one")
train_stores_holidays = pd.merge(train_with_stores, holidays, on=["date","city"], how="left")
test_stores_holidays = pd.merge(test_with_stores, holidays, on=["date","city"], how="left")
train_with_stores.head(10)
holidays.rename(columns={'locale_name': 'city', 'type': 'day_type'}, inplace=True)
holidays.loc[holidays["transferred"] == True, "day_type"] = "Work Day"
#holidays.loc[holidays["transferred"] == True]
holidays.loc[holidays["day_type"] != "Work Day", "day_type"] = "Holiday"
#agg_functions = {'day_type': 'first', 'city': 'first'}
#holidays1 = holidays.groupby(['date']).aggregate(agg_functions).reset_index()
holidays.drop(columns=["description","locale","transferred"],inplace=True)
holidays = holidays.drop_duplicates()
#holidays1.equals(holidays2)
holidays[holidays.duplicated(subset=["date","city"],keep=False)]
train_stores_holidays['day_type'] = train_stores_holidays['day_type'].fillna("Work Day")
test_stores_holidays['day_type'] = test_stores_holidays['day_type'].fillna("Work Day")
sns.lineplot(x="store_nbr", y="sales", data=train_stores_holidays)
p = train_stores_holidays.groupby('day_type')['sales'].mean()
p.plot(kind='bar')
plt.title('Sales by day_type')
plt.xlabel('day_type')
plt.ylabel('Mean Sales')
plt.show()
train_stores_holidays[train_stores_holidays.isna().any(axis=1)]