Kaggleの「Restaurant Recommendation Challenge」で、レコメンデーションを実装してみた。
今回の目的
機械学習・レコメンデーションの実装
アウトプットの活用定義(5W1H)
who 飲食店サイトが
when 経営会議で
where ー
why サービス利用者のUIを向上させるために
what おすすめのレストランを
how ユーザーAにはレストランXをおすすめするのが良いという感じで
実装手順
- ライブラリ、データのインポート
- EDA
- データプロセッシング
- k近傍法による学習モデルの構築
実装コード
# Import libraries
# NOTE(review): `drive` is not imported anywhere in the visible source; presumably
# `from google.colab import drive` is needed before this call — confirm.
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import missingno as msno
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten
from tensorflow.keras.regularizers import l2
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
# Load the data
data1=pd.read_csv("/content/drive/MyDrive/kaggle/Restaurant Recommendation Challenge/train_full.csv")
data1.head()
# Inspect the column dtypes
data1.info()
# Extract only the main columns
main_df=data1[['gender','location_type','language','OpeningTime','city_id','vendor_rating']]
# Draw the missingno matrix for the selected columns
msno.matrix(main_df)
# Count the missing values in the gender column and compute their share of all rows
gender_null = np.count_nonzero(data1['gender'].isnull())
print(gender_null)
gender_null/data1.shape[0]
# Visualise the male/female ratio
# NOTE(review): the plotting code for the "visualise" steps in this section is not in the visible source.
# Count the missing values in the location_type column and compute their share of all rows
location_null = np.count_nonzero(data1['location_type'].isnull())
print(location_null)
print("null Ratio : ", location_null/data1.shape[0])
# Visualise the ratio of eating locations
# Visualise the ratio of eating locations per gender
# Findings
# Men mostly eat at home, whereas women show almost no difference
# Count the missing values in the language column and compute their share of all rows
null = np.count_nonzero(data1['language'].isnull())
print(null)
null/data1.shape[0]
# Count the missing values in the OpeningTime column and compute their share of all rows
null = np.count_nonzero(data1['OpeningTime'].isnull())
print(null)
null/data1.shape[0]
# Count the missing values in the vendor rating column and compute their share of all rows
null = np.count_nonzero(data1['vendor_rating'].isnull())
print(null)
null/data1.shape[0]
# Visualise the spread of the vendor ratings
# Inspect the contents of orders.csv
# NOTE(review): `data2` is never assigned in the visible source; presumably it is
# orders.csv loaded with pd.read_csv — confirm and restore the load step.
data2.head()
data2.info()
# Findings
# orders.csv is the restaurants' order history
# Count the missing values in the vendor_rating column and compute their share of all rows
null = np.count_nonzero(data2['vendor_rating'].isnull())
print(null)
null/data2.shape[0]
# Visualise
# Show the number of restaurants
# Show the number of customers
# NOTE(review): the code for the three steps above is not in the visible source.
# Data processing
# Build a new dataset from the train data.
# .copy() makes dataset1 an independent frame, so the rename and the new column
# below are not applied to a slice of data1 (avoids SettingWithCopyWarning).
dataset1 = data1[['customer_id','gender','location_type','id','OpeningTime','language','vendor_rating','serving_distance','vendor_tag_name','delivery_charge']].copy()
dataset1 = dataset1.rename(columns={"vendor_rating": "mean_rating"})
# Create a key column combining customer ID and vendor ID as "<customer_id>_<id>".
# Vectorised string concatenation replaces the row-by-row apply/join.
cols = ['customer_id', 'id']
dataset1['all'] = dataset1[cols[0]].astype(str) + '_' + dataset1[cols[1]].astype(str)
dataset1.head()
# Remove duplicate rows
# NOTE(review): no de-duplication code follows this comment in the visible source —
# confirm whether a drop_duplicates step was lost.
# Build a new dataset from the order-history data.
# .copy() (instead of a [:] slice) makes dataset2 independent of data2, so the
# rename and the new column below do not trigger SettingWithCopyWarning.
dataset2 = data2[['akeed_order_id','customer_id','vendor_id', 'item_count', 'grand_total', 'vendor_rating']].copy()
dataset2 = dataset2.rename(columns={"vendor_id": "id"})
# Create a key column combining customer ID and vendor ID as "<customer_id>_<id>".
# Vectorised string concatenation replaces the row-by-row apply/join.
cols = ['customer_id', 'id']
dataset2['all'] = dataset2[cols[0]].astype(str) + '_' + dataset2[cols[1]].astype(str)
dataset2.head()
# Check the size of the two new datasets
print(dataset1.shape)
print(dataset2.shape)
# Inner-join the two datasets on the combined customer/vendor key
df1=pd.merge(dataset1,dataset2,on='all',how='inner')
df1.head()
df1.shape
# Minor cleanup: give the left-hand (dataset1) ID columns their plain names
df1.rename(columns = {"customer_id_x": "customer_id", "id_x": "vendor_id"}, inplace = True)
# Build a new DataFrame from three columns: customer ID, vendor ID and vendor tags.
# rename() returns a new frame here, so nothing is modified in place on a slice
# of dataset1 (avoids SettingWithCopyWarning).
df2 = dataset1[['customer_id','id','vendor_tag_name']].rename(columns={'id': 'vendor_id'})
df2.head()
# Data Cleaning
# Columns whose missing-value ratio is computed below
cols=[ 'serving_distance', 'delivery_charge','item_count', 'grand_total', 'vendor_rating']
def null_check(x, frame=None):
    """Print the number of missing values in column ``x`` and return their ratio.

    Args:
        x: Column label to inspect.
        frame: DataFrame to inspect. Defaults to the module-level ``df1``.

    Returns:
        The number of missing values in the column divided by the number of rows.

    Raises:
        ZeroDivisionError: If the frame has no rows.
    """
    target = df1 if frame is None else frame
    missing = int(target[x].isnull().sum())
    print(missing)
    return missing / target.shape[0]
for col in cols:
    print(col, 'null ratio :', null_check(col))
# Drop the language column
# Drop the rows whose gender is missing
# Convert the gender column to a numeric variable
# NOTE(review): the code for the three steps above is not in the visible source, and
# `sex` is never assigned here — presumably the numerically encoded gender columns; confirm.
df1=pd.concat([df1,sex],axis=1)
# Rename the vendor rating column
df1.rename(columns={'vendor_rating': 'rating'}, inplace=True)
print(df1.shape)
df1.head()
# Create the DataFrame used by the later model.
# .copy() (instead of a [:] slice) gives the model its own independent data.
df1_train_for_anal = df1.copy()
# Split the OpeningTime column into an opening time and a closing time, stored as new columns.
# NOTE(review): `train_contents` is not created in the visible source — confirm its origin.
# Assigning the filled column back (instead of a chained fillna(inplace=True) on a
# column selection) guarantees the frame itself is updated.
train_contents['OpeningTime'] = train_contents['OpeningTime'].fillna('-')
time_split = train_contents['OpeningTime'].str.split('-')
# Named open_time / close_time so the builtin open() is not shadowed.
open_time = time_split.str.get(0)
close_time = time_split.str.get(1)
# A missing part (no '-' separator, or no value at all) becomes an empty string.
train_contents['Open'] = open_time.fillna('')
train_contents['Close'] = close_time.fillna('')
print(train_contents['Open'].unique())
print(train_contents['Close'].unique())
# Classify opening/closing times into morning / afternoon / evening flags
def _hour_and_meridiem(x):
    """Split a time string into ``(hour, meridiem, length)``.

    ``hour`` is the leading one- or two-digit number, ``meridiem`` is the last
    two characters upper-cased (so "am"/"Am"/"AM" all compare equal to "AM"),
    and ``length`` is the length of the whitespace-stripped string.

    Returns:
        The tuple described above, or None when the string is empty or does
        not start with a digit.
    """
    text = x.strip()
    head = text[:2]
    # "08", "10" -> two digits; "8:", "9a", "9A" -> only the first character is the hour.
    digits = head if head.isdecimal() else head[:1]
    if not digits.isdecimal():
        return None
    return int(digits), text[-2:].upper(), len(text)


def morning_func(x):
    """Return 1 if the time string ``x`` counts as a morning time, else 0.

    A time counts as morning when its hour is 7-10 with an AM suffix (any
    letter case), or when it is a bare hour of at most two characters with a
    value of 10 or less.

    Returns:
        1 or 0, or None when ``x`` is empty or has no leading hour.
    """
    parsed = _hour_and_meridiem(x)
    if parsed is None:
        return None
    hour, meridiem, length = parsed
    if 7 <= hour <= 10 and meridiem == "AM":
        return 1
    if hour <= 10 and length <= 2:
        return 1
    return 0


def afternoon_func(x):
    """Return 1 if the time string ``x`` counts as an afternoon time, else 0.

    A time counts when it is 12 PM or at most 1 PM, when it has an AM suffix
    (any letter case), or when it is a bare hour of at most two characters
    with a value of 10 or less.

    Returns:
        1 or 0, or None when ``x`` is empty or has no leading hour.
    """
    parsed = _hour_and_meridiem(x)
    if parsed is None:
        return None
    hour, meridiem, length = parsed
    if meridiem == "PM" and (hour <= 1 or hour == 12):
        return 1
    if meridiem == "AM":
        return 1
    if hour <= 10 and length <= 2:
        return 1
    return 0


def evening_func(x):
    """Return 1 if the time string ``x`` counts as an evening time, else 0.

    A time counts when it is 6 PM or later, when it has an AM suffix (any
    letter case), or when it is a bare hour of at most two characters with a
    value of 22 or more.

    Returns:
        1 or 0, or None when ``x`` is empty or has no leading hour.
    """
    parsed = _hour_and_meridiem(x)
    if parsed is None:
        return None
    hour, meridiem, length = parsed
    if (hour >= 6 and meridiem == "PM") or meridiem == "AM":
        return 1
    if hour >= 22 and length <= 2:
        return 1
    return 0
# The morning/afternoon flags come from the opening time, the evening flag from the closing time.
train_contents["morning"] = train_contents["Open"].apply(morning_func)
train_contents["afternoon"] = train_contents["Open"].apply(afternoon_func)
train_contents["evening"] = train_contents["Close"].apply(evening_func)
train_contents.head()
# Show the missing values in the 'vendor_tag_name' column
null = np.count_nonzero(train_contents['vendor_tag_name'].isnull())
print(null)
print(null/train_contents.shape[0]) # 1%
# Drop the rows that contain missing values
# NOTE(review): no drop code follows this comment in the visible source — confirm
# whether a dropna step was lost.
null = np.count_nonzero(train_contents['vendor_tag_name'].isnull())
print(null)
# Store the offered menu tags as a list.
# .str.lower() leaves missing values as NaN instead of raising like x.lower() would.
train_contents['vendor_tag_name'] = train_contents['vendor_tag_name'].str.lower()
train_contents['vendor_tag']= train_contents['vendor_tag_name'].str.split(',')
train_contents['vendor_tag'].head()
# Build the values of the morning / afternoon / evening columns
# morning column
def breakfast1(tag, x2):
    """Fill a missing morning flag from the vendor tags.

    Args:
        tag: Iterable of tag strings for one vendor.
        x2: The existing morning flag; may be missing (None or NaN).

    Returns:
        1 when ``x2`` is missing and any tag contains 'breakfast';
        otherwise ``x2`` unchanged.
    """
    # pd.isna accepts None as well as NaN, whereas np.isnan(None) raises TypeError.
    if pd.isna(x2) and any('breakfast' in name for name in tag):
        return 1
    return x2
# mor2: the morning flag, set to 1 where it was missing and a tag contains 'breakfast'.
train_contents['mor2'] = train_contents.apply(lambda row: breakfast1(row['vendor_tag'], row['morning']), axis=1)
# Rows whose morning flag was missing but was inferred as 1 from the tags.
# A boolean mask with .loc replaces the per-row chained assignment
# (train_contents['afternoon'][i] = 1), which looked rows up by label using a
# positional counter and wrote through an intermediate Series.
inferred_breakfast = train_contents['morning'].isna() & (train_contents['mor2'] == 1.0)
# afternoon column
train_contents.loc[inferred_breakfast, 'afternoon'] = 1
# evening column
train_contents.loc[inferred_breakfast, 'evening'] = 0
# Check the remaining missing values
null = np.count_nonzero(train_contents['mor2'].isnull())
print(null)
print(null/train_contents.shape[0])
# Replace the original morning column with mor2. Renaming mor2 to 'morning' while
# the old column still exists would leave two columns with the same name.
train_contents['morning'] = train_contents.pop('mor2')
train_contents.head()
# Collaborative Filtering
# Take the required columns; .copy() keeps later edits off df1_train_for_anal.
cus_ven_ratings = df1_train_for_anal[['customer_id', 'vendor_id', 'rating']].copy()
cus_ven_ratings
# Collect the vendor ratings that are neither missing nor 0.
# The boolean filter covers every row; the previous range(0, n - 1) loop skipped the last one.
observed_ratings = cus_ven_ratings['rating']
ratings_not_none = observed_ratings[observed_ratings.notna() & (observed_ratings != 0)].tolist()
valid_rating_mean = np.mean(np.array(ratings_not_none))
# Fill missing values with the mean of the vendor rating column
def rating_missing_func(x, fill_value=None):
    """Replace a missing or zero rating with a fill value.

    Args:
        x: A single rating.
        fill_value: Value returned for a missing or zero rating. Defaults to
            the module-level ``valid_rating_mean``.

    Returns:
        The fill value when ``x`` is null or equal to 0; otherwise ``x``.
    """
    if pd.isnull(x) or x == 0:
        return valid_rating_mean if fill_value is None else fill_value
    return x
# assign() returns a new frame, so the new column is never written into a slice
# of df1_train_for_anal (avoids SettingWithCopyWarning).
cus_ven_ratings = cus_ven_ratings.assign(rating2=cus_ven_ratings["rating"].apply(rating_missing_func))
cus_ven_ratings
# Keep the filled rating and rename it back to 'rating'.
# The former rename key 1 matched no column and has been dropped.
cus_ven_ratings = cus_ven_ratings[['customer_id', 'vendor_id', 'rating2']].rename(columns={'rating2': 'rating'})
cus_ven_ratings
# Group by customer ID and vendor ID and compute the mean rating
cus_ven_ratings_mean = cus_ven_ratings.groupby(['customer_id', 'vendor_id']).mean()
cus_ven_ratings_mean
# Create a new DataFrame with the group keys moved back into columns
df_cus_ven_ratings_mean = cus_ven_ratings_mean.reset_index()
df_cus_ven_ratings_mean
# Pivot into a customer x vendor table for the model
rating_full_matrix = df_cus_ven_ratings_mean.pivot(index='customer_id', columns='vendor_id', values='rating')
rating_full_matrix
# Compute the similarity between customers with scikit-learn's cosine_similarity.
# The import lives here because the file's import section does not provide it.
from sklearn.metrics.pairwise import cosine_similarity

# Missing ratings are treated as 0 for the similarity computation only.
rating_matrix_dummy = rating_full_matrix.copy().fillna(0)
customer_similarity = cosine_similarity(rating_matrix_dummy, rating_matrix_dummy)
# Label both axes with the customer IDs so rows/columns can be looked up by customer.
customer_similarity = pd.DataFrame(customer_similarity, index = rating_full_matrix.index, columns=rating_full_matrix.index)
customer_similarity
# Evaluate the learned model with RMSE
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equally sized sequences.

    Args:
        y_true: Observed values.
        y_pred: Predicted values.

    Returns:
        The square root of the mean squared difference.
    """
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))
def knn_score(model, neigbor_size=0):
    """Score ``model`` against the known ratings in ``df_cus_ven_ratings_mean``.

    ``model`` is called as ``model(customer, vendor, neigbor_size)`` for every
    customer/vendor pair in the table, and the predictions are compared with
    the table's 'rating' column.

    Returns:
        The value of ``RMSE(actual, predicted)``.
    """
    customers = df_cus_ven_ratings_mean['customer_id']
    vendors = df_cus_ven_ratings_mean['vendor_id']
    predicted = np.array([
        model(customer, vendor, neigbor_size)
        for customer, vendor in zip(customers, vendors)
    ])
    actual = np.array(df_cus_ven_ratings_mean['rating'])
    return RMSE(actual, predicted)
# Predict a vendor's rating for a customer with k-nearest-neighbour collaborative filtering
def cf_knn(customer_id, vendor_id, neighbor_size=0):
    """Predict the rating ``customer_id`` would give ``vendor_id``.

    The prediction is the similarity-weighted mean of the ratings given to the
    vendor by the customers who rated it, using the module-level
    ``rating_full_matrix`` and ``customer_similarity``.

    Args:
        customer_id: Label of the customer in ``customer_similarity``.
        vendor_id: Column label of the vendor in ``rating_full_matrix``.
        neighbor_size: Number of most-similar raters to use; 0 uses all of them.

    Returns:
        The predicted rating. ``valid_rating_mean`` is returned when the vendor
        is not in the matrix, when a neighbour size is given but at most one
        customer rated the vendor, or when the similarities sum to zero.

    Raises:
        KeyError: If ``customer_id`` is not in ``customer_similarity``.
    """
    if vendor_id not in rating_full_matrix:
        return valid_rating_mean

    # Keep only the customers who actually rated this vendor, in both series.
    vendor_ratings = rating_full_matrix[vendor_id]
    unrated = vendor_ratings[vendor_ratings.isnull()].index
    vendor_ratings = vendor_ratings.drop(unrated)
    sim_scores = customer_similarity[customer_id].drop(unrated)

    if neighbor_size != 0:
        if len(sim_scores) <= 1:
            return valid_rating_mean
        # Cannot use more neighbours than there are raters.
        neighbor_size = min(neighbor_size, len(sim_scores))
        # Plain arrays so argsort positions index both arrays consistently.
        sim_scores = np.array(sim_scores)
        vendor_ratings = np.array(vendor_ratings)
        # argsort is ascending, so the last entries are the most similar raters.
        nearest = np.argsort(sim_scores)[-neighbor_size:]
        sim_scores = sim_scores[nearest]
        vendor_ratings = vendor_ratings[nearest]

    total_similarity = sim_scores.sum()
    # A zero total would make the weighted mean NaN/inf; use the same fallback
    # as the other "not enough information" cases.
    if total_similarity == 0:
        return valid_rating_mean
    return np.dot(sim_scores, vendor_ratings) / total_similarity
# Evaluate cf_knn through knn_score with a neighbour size of 20
knn_score(cf_knn, neigbor_size=20)
# Build a recommendation list for a given customer with collaborative filtering
def cf_recom_vendor(customer_id, n_items, neighbor_size=0):
    """Recommend vendors to ``customer_id`` based on predicted ratings.

    Vendors the customer already rated get a score of 0; every other vendor in
    ``rating_full_matrix`` is scored with ``cf_knn``. The ``n_items`` vendors
    with the highest scores are returned.

    Args:
        customer_id: Row label of the customer in ``rating_full_matrix``.
        n_items: Number of vendors to return.
        neighbor_size: Neighbour size passed through to ``cf_knn``.

    Returns:
        A DataFrame with the columns 'vendor_id', 'mean_rating' and
        'vendor_tag_name', one row per recommended vendor, ordered from the
        highest predicted rating down.

    Raises:
        KeyError: If ``customer_id`` is not a row of ``rating_full_matrix``.
    """
    customer_vendor = rating_full_matrix.loc[customer_id].copy()
    for vendor in rating_full_matrix:
        if pd.notnull(customer_vendor.loc[vendor]):
            # Already rated by this customer: score 0 so it sorts below predictions.
            customer_vendor.loc[vendor] = 0
        else:
            customer_vendor.loc[vendor] = cf_knn(customer_id, vendor, neighbor_size)

    # Highest predicted ratings first.
    vendor_sort = customer_vendor.sort_values(ascending=False)[:n_items]

    # vendor_sort is indexed by vendor ID, so the vendor details must be looked up
    # by vendor_id, not by the row labels of df1_train_for_anal.
    vendor_info = (
        df1_train_for_anal[['vendor_id', 'mean_rating', 'vendor_tag_name']]
        .drop_duplicates(subset='vendor_id')
        .set_index('vendor_id')
    )
    recom_vendors = (
        vendor_info.reindex(vendor_sort.index)
        .rename_axis('vendor_id')
        .reset_index()
    )
    return recom_vendors
# Recommend restaurants to the customer whose ID is 'ZZV76GY'
cf_recom_vendor(customer_id='ZZV76GY', n_items=5, neighbor_size=30)
実行結果
k近傍法を使ったモデルの予測精度(RMSE):0.3865892604982157
参考サイト
Recomm. system for Deli. App.(CF, MF, TF-IDF, D2V) | Kaggle
*1:RMSE関数の本体と思われる式:`return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))`(二重括弧が脚注として解釈され、本文から抜け落ちたものと思われる)