# importing
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor

pd.options.mode.chained_assignment = None  # silence SettingWithCopy warnings
# loading data
data = pd.read_csv("expenses.csv", parse_dates=["st_date","t_date"])
data.sort_values(by=["t_date", "st_date"], inplace=True)  # sort chronologically by transaction date, then settlement date
data_copy = data.copy()
data.head(10)
# summary statistics
print("Length:", len(data))
print("Number of columns:", len(data.columns))
print()
print("The data ranges from:", data.t_date.min(), "to:", data.t_date.max())
print("Maximum amount in the account: $", data.total_amt.max())
print("Mode:", data.total_amt.mode().iloc[0])
print("Median:", data.total_amt.median())
print("Mean:", data.total_amt.mean())
print("Minimum amount in the account: $", data.total_amt.min())
print()
print("The first transaction to the account was: $", data.t_amount.values[0], "at:", data.tran_details.values[0])
# data preprocessing functions
days = {"Monday":0, "Tuesday":1, "Wednesday":2, "Thursday":3, "Friday":4, "Saturday":5, "Sunday":6}
# convert a date (string or datetime) to its day of the week, e.g. "Monday";
# pd.Timestamp handles both, so no manual string splitting is needed
def convert_date_to_week(date):
    return pd.Timestamp(date).strftime("%A")
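# quick sanity check of the helper on a hypothetical date:
print(convert_date_to_week("2019/08/15"))  # -> "Thursday"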
data.set_index(data_copy.tran_details, inplace=True)
# dropping the two card-reload rows (including the highest amount): they are
# deposits, not expenses
data.drop("Card Reload F0566I19SDD22005", inplace=True, axis=0)
data.drop("Card Reload F0566I19SCX88728", inplace=True, axis=0)
data.head()
# resetting the index after the drops
data.reset_index(drop=True, inplace=True)
# dropping columns not needed for the analysis: settlement date, reference
# number, running total, currency/fee/charge fields, and the debit/credit flag
data.drop(columns=["st_date", "ref_num", "total_amt", "curr", "s_amt", "curr_2", "fee_amt", "fee_gst", "cross_charges", "gst_cross", "D_C"], inplace=True)
data.head()
threshold = 10.0  # for a student, spending $10 or more counts as expensive (adjustable)
data["is_place_duplicated"] = data["tran_details"].duplicated()  # True for repeat visits (first occurrence stays False)
"""
return all the duplicated places.
"""
def get_duplicated_places(data, key="is_place_duplicated"):
places = []
for place, t_f in zip(data["tran_details"],data[key]):
if t_f:
places.append(place)
else:
pass
return places
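# equivalent vectorized form of the helper above (a sketch; boolean indexing
# keeps only the rows flagged as duplicated):
duplicated_places = data.loc[data["is_place_duplicated"], "tran_details"].tolist()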
# get the transaction count for each place
def get_count():
    counts = data.tran_details.value_counts()
    return {"place": list(counts.index), "count": list(counts.values)}
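# the same place/count table can also be built directly in pandas (equivalent
# one-liner; value_counts() already returns places sorted by count):
place_counts = data.tran_details.value_counts().rename_axis("place").reset_index(name="count")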
for place in get_duplicated_places(data):
    print(place)
data.head()
print ("Max duplicated place: ", data.tran_details.value_counts().max(), (get_count()["place"][0]))
pd.DataFrame(get_count()).plot.bar(x="place",y="count")
# total money spent at each place (the two card-reload rows were dropped
# earlier, so these sums cover spending only)
data.groupby("tran_details")["t_amount"].sum()
# max, min, and mean of the money spent per place
print("Max money spent:", data.groupby("tran_details")["t_amount"].sum().max())
print("Min money spent:", data.groupby("tran_details")["t_amount"].sum().min())
print("Mean money spent:", data.groupby("tran_details")["t_amount"].sum().mean())
# feature engineering: replace tran_details with a binary "expensive" flag
def create_new_column():
    # 1 if the transaction meets the threshold, else 0; np.where avoids
    # pandas chained-assignment issues
    data["tran_details"] = np.where(data["t_amount"] >= threshold, 1, 0)

create_new_column()  # apply the transformation
data.head()
data["is_place_duplicated"]=data["is_place_duplicated"].map({False:0, True:1}) # changing the is_place_duplicated
data.head()
sns.lineplot(x="t_date",y="t_amount", data=data, hue="is_place_duplicated")
sns.scatterplot(x="t_amount",y="t_date", data=data,hue="is_place_duplicated" )
sns.countplot(x="tran_details",data=data, hue="tran_details")
data.groupby("tran_details")["t_amount"].sum()
data.groupby("tran_details")["t_amount"].mean()
data.groupby("tran_details")["t_amount"].max()
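# the three aggregates above can be computed in a single call (equivalent form):
data.groupby("tran_details")["t_amount"].agg(["sum", "mean", "max"])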
data.head()
# dates of the largest and smallest transactions
data.t_date[data.t_amount == np.max(data.t_amount.values)]
data.t_date[data.t_amount == np.min(data.t_amount.values)]
data.head()
"""
convert()
convert the date in to day.
"""
def convert(data):
days = []
for date in data["t_date"]:
date = str(date).replace("00:00:00","")
date = date.replace("-","/")
days.append(convert_date_to_week(date))
return days
# convert the column to weekday names, then map the names to 0-6
data["t_date"] = convert(data)
data["t_date"] = data["t_date"].map(days)
data.head()
data.corr()
sns.relplot(x="t_date", y="t_amount",data=data)
sns.relplot(x="tran_details", y="t_amount",data=data)
sns.relplot(x="is_place_duplicated", y="t_amount",data=data)
sns.pairplot(data,hue="tran_details")
features = data.drop("t_amount", axis=1).values
labels = data.t_amount.values
# splitting data
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, shuffle=True, random_state=42)
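# quick sanity check on the split sizes (train_test_split defaults to 75/25):
print("train:", features_train.shape, "test:", features_test.shape)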
regressors = {
    "linear regression": LinearRegression(),
    "svr": SVR(),
    "random forest": RandomForestRegressor(n_estimators=5),
    "decision tree": DecisionTreeRegressor(),
    "bagging": BaggingRegressor(),
    "gradient boosting": GradientBoostingRegressor(),
}
def train():
    for name, model in regressors.items():
        print("Training with:", name)
        model.fit(features_train, labels_train)
        print("Training score:", model.score(features_train, labels_train))
        print("Testing score:", model.score(features_test, labels_test))
        print()

train()
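# a single train/test split can be noisy on a small dataset; cross-validation
# gives a steadier estimate (minimal sketch using sklearn's cross_val_score):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(RandomForestRegressor(n_estimators=5), features, labels, cv=5)
print("Random forest 5-fold CV R^2:", cv_scores.mean())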
# hypothetical input, assuming the feature order after the drops above is
# (t_date, tran_details, is_place_duplicated)
demo = np.array([days["Saturday"], 1, 0])
print("Estimated cost on a Saturday: $", regressors["random forest"].predict(demo.reshape(1, -1))[0])