機械学習で好みのレストランをおすすめする

Kaggleの「Restaurant Recommendation Challenge」のデータを使って、レコメンデーションを実装してみた。
今回の目的

機械学習・レコメンデーションの実装
アウトプットの活用定義(5W1H)

who　飲食店サイトが

when　経営会議で
where　ー
why　サービス利用者のUIを向上させるために
what　おすすめのレストランを
how　ユーザーAにはレストランXをおすすめするのが良いという感じで
実装手順

ライブラリ、データのインポート
EDA
データプロセッシング
k近傍法による学習モデルの構築
実装コード

# ライブラリのインポート
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam, Adamax
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

# Load the training data
data1=pd.read_csv("/content/drive/MyDrive/kaggle/Restaurant Recommendation Challenge/train_full.csv")
data1.head()

# Inspect the column dtypes and non-null counts
data1.info()

# Extract only the main columns and plot where their values are missing
main_df=data1[['gender','location_type','language','OpeningTime','city_id','vendor_rating']]
msno.matrix(main_df)

# Value counts and missing-value ratio of the gender column
print(data1['gender'].value_counts())
gender_null = np.count_nonzero(data1['gender'].isnull())
print(gender_null)
# Printed explicitly: a bare expression is only echoed when it ends a notebook cell
print("null Ratio : ", gender_null/data1.shape[0])
# Visualise the gender distribution.
# `x` is passed by keyword because seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='gender',data=data1)

# Value counts and missing-value ratio of the location_type column
print(data1['location_type'].value_counts())
location_null = np.count_nonzero(data1['location_type'].isnull())
print(location_null)
print("null Ratio : ", location_null/data1.shape[0])
# Visualise the distribution of location types
sns.countplot(x='location_type',data=data1)

# Visualise the location types split by gender
sns.countplot(x='gender',hue='location_type',data=data1)
# Finding:
# men mostly eat at home, whereas women show almost no difference between locations

# Value counts and missing-value ratio of the language column
print(data1['language'].value_counts())
null = np.count_nonzero(data1['language'].isnull())
print(null)

null/data1.shape[0] 

# Number and ratio of missing values in the OpeningTime column
null = np.count_nonzero(data1['OpeningTime'].isnull())
print(null)

null/data1.shape[0]

# Value counts and missing-value ratio of the vendor rating
print(data1['vendor_rating'].value_counts())
null = np.count_nonzero(data1['vendor_rating'].isnull())
print(null)
# Printed explicitly: a bare expression is only echoed when it ends a notebook cell
print("null Ratio : ", null/data1.shape[0])
# Visualise the spread of vendor ratings.
# `x` is passed by keyword because seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='vendor_rating',data=data1)


# Inspect the contents of orders.csv
data2 = pd.read_csv("/content/drive/MyDrive/kaggle/Restaurant Recommendation Challenge/orders.csv")
data2.head()
data2.info()
# Finding:
# orders.csv is the restaurant order history

# Value counts and missing-value ratio of the vendor_rating column
print(data2['vendor_rating'].value_counts())
null = np.count_nonzero(data2['vendor_rating'].isnull())
print(null)
# Printed explicitly: a bare expression is only echoed when it ends a notebook cell
print("null Ratio : ", null/data2.shape[0])
# Visualise.
# `x` is passed by keyword because seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x='vendor_rating',data=data2)

# Number of orders per vendor (one output row per distinct vendor)
print(data2['vendor_id'].value_counts())
# Number of orders per customer (one output row per distinct customer)
print(data2['customer_id'].value_counts())


# Data processing

# Build a new dataset from the train data.
# .copy() makes it an independent frame, so the in-place edits below do not
# raise pandas' SettingWithCopyWarning.
dataset1 = data1[['customer_id','gender','location_type','id','OpeningTime','language','vendor_rating','serving_distance','vendor_tag_name','delivery_charge']].copy()
dataset1.rename(columns = {"vendor_rating": "mean_rating"}, inplace = True)

# Build a composite key "<customer_id>_<id>".
# Vectorised string concatenation replaces the row-wise apply, which calls a
# Python lambda once per row.
cols = ['customer_id', 'id']
dataset1['all'] = dataset1[cols[0]].astype(str) + '_' + dataset1[cols[1]].astype(str)
dataset1.head()
# Drop rows that repeat the same customer/vendor pair
dataset1.drop_duplicates(['all'],inplace=True)

# Build a new dataset from the order-history data (independent copy, as above)
dataset2 = data2[['akeed_order_id','customer_id','vendor_id', 'item_count', 'grand_total', 'vendor_rating']].copy()
dataset2.rename(columns = {"vendor_id": "id"}, inplace = True)

# Same composite key, so the two datasets can be joined on it
cols = ['customer_id', 'id']
dataset2['all'] = dataset2[cols[0]].astype(str) + '_' + dataset2[cols[1]].astype(str)
dataset2.head()

# Check the sizes of the two new datasets
print(dataset1.shape)
print(dataset2.shape)

# Inner-join the two datasets on the 'all' column
df1=pd.merge(dataset1,dataset2,on='all',how='inner')
df1.head()
df1.shape

# Tidy up: keep the left-hand customer_id_x / id_x columns under the names
# customer_id / vendor_id and drop the right-hand customer_id_y / id_y columns
df1.rename(columns = {"customer_id_x": "customer_id", "id_x": "vendor_id"}, inplace = True)
df1.drop(['customer_id_y','id_y'],axis=1,inplace=True)

# Build a new DataFrame from the customer ID, vendor ID and vendor tag columns.
# rename() returns a new frame here; renaming in place on a column sub-selection
# can raise pandas' SettingWithCopyWarning.
df2=dataset1[['customer_id','id','vendor_tag_name']].rename(columns={'id':'vendor_id'})
df2.head()

# Data Cleaning

# Columns whose share of missing values is to be computed
cols=[ 'serving_distance', 'delivery_charge','item_count', 'grand_total', 'vendor_rating']
def null_check(x):
    """Print how many values of ``df1[x]`` are missing and return their share.

    Args:
        x: Name of a column of the module-level ``df1`` frame.

    Returns:
        The number of missing values in the column divided by the number of
        rows of ``df1``.
    """
    missing_mask = df1[x].isnull()
    missing_count = np.count_nonzero(missing_mask)
    print(missing_count)

    total_rows = df1.shape[0]
    return missing_count / total_rows

# Report the missing-value ratio of every column listed in cols
for i in cols:
  print(i,'null ratio :', null_check(i))

# Drop the language column
df1.drop(['language'],axis=1,inplace=True)
# Drop rows whose gender is missing and renumber the index
df1 = df1[df1['gender'].notnull()].reset_index(drop=True)

# Turn gender into dummy columns (prefix "sex", first level dropped) and
# replace the original gender column with them
sex=pd.get_dummies(df1["gender"], columns = ['gender'],prefix="sex",drop_first=True)
df1=pd.concat([df1,sex],axis=1)
df1.drop(['gender'],axis=1,inplace=True)

# Rename the vendor_rating column to rating
df1.rename(columns={'vendor_rating': 'rating'}, inplace=True)
print(df1.shape)
df1.head()

# Keep a DataFrame for the models built later (df1[:] is a slice of df1,
# not an explicit .copy())
df1_train_for_anal = df1[:]

# Split the OpeningTime column ("<open>-<close>") into new Open and Close columns.
# NOTE(review): `train_contents` is not defined anywhere in the visible code -
# presumably it is a copy of df1 created in a cell that is not shown; confirm.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a column selection: that is chained assignment, which pandas warns about and
# which does not update the frame under copy-on-write.
train_contents['OpeningTime'] = train_contents['OpeningTime'].fillna('-')

time_split= train_contents.OpeningTime.str.split('-')
# NOTE: the name `open` shadows the builtin open(); kept so existing references still work
open=time_split.str.get(0)
close=time_split.str.get(1)

train_contents['Open']=open
train_contents['Close']=close

# Rows without a second part get an empty string rather than NaN
train_contents['Open'] = train_contents['Open'].fillna('')
train_contents['Close'] = train_contents['Close'].fillna('')

print(train_contents['Open'].unique())
print(train_contents['Close'].unique())

# Classify the opening time into morning / afternoon / evening flags
def morning_func(x):
    """Return 1 if the opening-time string ``x`` falls in the morning, else 0.

    Morning means an hour from 7 to 10 with an AM suffix, or a bare hour of at
    most two characters that is <= 10.

    Args:
        x: Opening time such as ``"08:00AM"``, ``"9am"`` or ``"10"``.

    Returns:
        ``None`` for an empty string, otherwise ``1`` or ``0``.

    Raises:
        ValueError: If the first two characters (after removing ``":"`` and
            ``"a"``) are not an integer.
    """
    if x == "":
        return None

    # The hour is read from the first two characters: "9:" and "9a" both mean 9.
    hour = int(x[:2].replace(":", "").replace("a", ""))
    # Bug fix: the original `x2 == ("AM" or "am")` only ever compared against
    # "AM"; the suffix is now compared case-insensitively.
    meridiem = x[-2:].upper()

    if 7 <= hour <= 10 and meridiem == "AM":
        return 1
    if hour <= 10 and len(x) <= 2:
        return 1
    return 0

def afternoon_func(x):
    """Return 1 if a venue opening at ``x`` is treated as open in the afternoon.

    That is the case when it opens at 12 PM or by 1 PM, at any AM time, or at
    a bare hour of at most two characters that is <= 10.

    Args:
        x: Opening time such as ``"12:00PM"``, ``"9am"`` or ``"10"``.

    Returns:
        ``None`` for an empty string, otherwise ``1`` or ``0``.

    Raises:
        ValueError: If the first two characters (after removing ``":"`` and
            ``"a"``) are not an integer.
    """
    if x == "":
        return None

    # The hour is read from the first two characters: "1:" and "9a" mean 1 and 9.
    hour = int(x[:2].replace(":", "").replace("a", ""))
    # Bug fix: the original `x2 == ("AM" or "am")` only ever compared against
    # "AM"; the suffix is now compared case-insensitively.
    meridiem = x[-2:].upper()

    if meridiem == "PM" and (hour <= 1 or hour == 12):
        return 1
    if meridiem == "AM":
        return 1
    if hour <= 10 and len(x) <= 2:
        return 1
    return 0

def evening_func(x):
    """Return 1 if a venue closing at ``x`` is treated as open in the evening.

    That is the case when it closes at 6 PM or later, at any AM time (past
    midnight), or at a bare hour of at most two characters that is >= 22.

    Args:
        x: Closing time such as ``"11:30PM"``, ``"01:00AM"`` or ``"23"``.

    Returns:
        ``None`` for an empty string, otherwise ``1`` or ``0``.

    Raises:
        ValueError: If the first two characters (after removing ``":"``) are
            not an integer.
    """
    if x == "":
        return None

    hour = int(x[:2].replace(":", ""))
    # Bug fix: the original `x2 == ("Am" or "am")` only ever compared against
    # "Am", so ordinary "AM" closing times were never recognised; the suffix is
    # now compared case-insensitively.
    meridiem = x[-2:].upper()

    if (hour >= 6 and meridiem == "PM") or meridiem == "AM":
        return 1
    if hour >= 22 and len(x) <= 2:
        return 1
    return 0

# Derive the morning and afternoon flags from the opening time and the evening
# flag from the closing time
train_contents["morning"] = train_contents["Open"].apply(morning_func)
train_contents["afternoon"] = train_contents["Open"].apply(afternoon_func)
train_contents["evening"] = train_contents["Close"].apply(evening_func)

train_contents.head()

# Count the missing values in the 'vendor_tag_name' column
null = np.count_nonzero(train_contents['vendor_tag_name'].isnull())
print(null)
print(null/train_contents.shape[0]) # about 1% according to the author's note
# Drop the rows with a missing tag and confirm that none remain
train_contents= train_contents[train_contents['vendor_tag_name'].notnull()].reset_index(drop=True)
null = np.count_nonzero(train_contents['vendor_tag_name'].isnull())
print(null)

# Lower-case the tag string, then split it on commas into a list of tags
train_contents['vendor_tag_name']=train_contents['vendor_tag_name'].apply(lambda x:x.lower())

train_contents['vendor_tag']= train_contents['vendor_tag_name'].str.split(',')
train_contents['vendor_tag'].head()

# Fill in the morning / afternoon / evening flags from the vendor tags

# morning column
def breakfast1(tag, x2):
    """Fill a missing morning flag from the vendor tags.

    Args:
        tag: Iterable of tag strings for one vendor.
        x2: The current morning flag; NaN when it is unknown.

    Returns:
        ``1`` when the flag is NaN and at least one tag contains
        ``'breakfast'``; otherwise ``x2`` unchanged (an existing 0 or 1 is
        never overwritten).
    """
    serves_breakfast = any('breakfast' in entry for entry in tag)
    if serves_breakfast and np.isnan(x2):
        return 1
    return x2

# mor2: the morning flag with unknown values filled from the vendor tags
train_contents['mor2']=train_contents.apply(lambda x:  breakfast1(x['vendor_tag'],x['morning']),axis=1)

# Rows whose morning flag was unknown but was set to 1 from the tags
breakfast_filled = train_contents['morning'].isna() & (train_contents['mor2'] == 1.0)

# One masked .loc assignment per column replaces the per-row loops that wrote
# through train_contents[col][i] = value; that form is chained assignment,
# which pandas warns about and which does not update the frame under
# copy-on-write.

# afternoon column: those rows are marked as open in the afternoon
train_contents.loc[breakfast_filled, 'afternoon'] = 1

# evening column: those rows are marked as not open in the evening
train_contents.loc[breakfast_filled, 'evening'] = 0

# Check how many mor2 values are still missing
null = np.count_nonzero(train_contents['mor2'].isnull())
print(null)
print(null/train_contents.shape[0])

# Drop the rows where mor2 is missing, then let mor2 replace the original
# morning column
train_contents= train_contents[train_contents['mor2'].notnull()].reset_index(drop=True)
train_contents.drop(['morning'],axis=1,inplace=True)
train_contents.rename(columns={'mor2':'morning'},inplace=True)

train_contents.head()


# Collaborative Filtering


# Pick the columns needed (.copy() gives an independent frame, so adding
# columns to it later does not raise SettingWithCopyWarning)
cus_ven_ratings = df1_train_for_anal[['customer_id', 'vendor_id', 'rating']].copy()
cus_ven_ratings

# Collect the ratings that are neither missing nor 0.
# A boolean mask replaces the per-row loop, which stopped at
# range(0, n - 1) and therefore skipped the last row, and which read the
# rating through positional Series[2] access.
rating_column = cus_ven_ratings['rating']
ratings_not_none = rating_column[rating_column.notnull() & (rating_column != 0)].tolist()

valid_rating_mean = np.mean(np.array(ratings_not_none))

# Replace missing ratings with the mean of the valid ratings
def rating_missing_func(x):
    """Return ``x``, or the mean valid rating when ``x`` is missing or 0.

    Args:
        x: A single rating value.

    Returns:
        The module-level ``valid_rating_mean`` when ``x`` is null or equal to
        0; otherwise ``x`` unchanged.
    """
    if pd.isnull(x) or x == 0:
        return valid_rating_mean
    return x

# rating2: the rating with missing and 0 values replaced via rating_missing_func
cus_ven_ratings["rating2"] = cus_ven_ratings["rating"].apply(rating_missing_func)
cus_ven_ratings

# Keep only the imputed rating and name it 'rating' again
cus_ven_ratings = cus_ven_ratings[['customer_id', 'vendor_id', 'rating2']]
# NOTE(review): the key 1 matches none of the three columns selected above, so
# presumably only rating2 -> rating takes effect; confirm and drop the extra key
cus_ven_ratings.rename(columns={'rating2':'rating', 1:'customer_id_num'}, inplace=True)
cus_ven_ratings

# Group by customer ID and vendor ID and take the mean rating of each pair
cus_ven_ratings_mean = cus_ven_ratings.groupby(['customer_id', 'vendor_id']).mean()
cus_ven_ratings_mean
# Build a new DataFrame whose index is sequential again
df_cus_ven_ratings_mean = cus_ven_ratings_mean.reset_index()
df_cus_ven_ratings_mean

# Pivot into a customer x vendor table of ratings for the model
rating_full_matrix = df_cus_ven_ratings_mean.pivot(index='customer_id', columns='vendor_id', values='rating')
rating_full_matrix


# Compute the similarity between customers with scikit-learn's cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
# Unrated customer/vendor pairs are filled with 0 for the similarity computation
rating_matrix_dummy = rating_full_matrix.copy().fillna(0)

customer_similarity = cosine_similarity(rating_matrix_dummy, rating_matrix_dummy)

# Label both axes with the customer IDs so similarities can be looked up by ID
customer_similarity = pd.DataFrame(customer_similarity, index = rating_full_matrix.index, columns=rating_full_matrix.index)
customer_similarity

# Evaluate the model with the root-mean-square error (RMSE)
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``.

    The original line was syntactically broken (its inner expression had been
    pulled out into a footnote); this restores
    ``sqrt(mean((y_true - y_pred) ** 2))``.

    Args:
        y_true: Sequence of observed values.
        y_pred: Sequence of predicted values of the same length.

    Returns:
        The RMSE as a float.
    """
    errors = np.array(y_true) - np.array(y_pred)
    return np.sqrt(np.mean(errors ** 2))

def knn_score(model, neigbor_size=0):
    """Score a rating predictor by RMSE over every known customer/vendor pair.

    Args:
        model: Callable ``model(customer, vendor, neighbor_size)`` returning a
            predicted rating.
        neigbor_size: Neighbour count passed through to ``model`` as its third
            positional argument.

    Returns:
        The RMSE between the ratings in ``df_cus_ven_ratings_mean`` and the
        model's predictions for the same pairs.
    """
    customers = df_cus_ven_ratings_mean['customer_id']
    vendors = df_cus_ven_ratings_mean['vendor_id']
    predictions = [
        model(customer, vendor, neigbor_size)
        for customer, vendor in zip(customers, vendors)
    ]
    y_pred = np.array(predictions)
    y_true = np.array(df_cus_ven_ratings_mean['rating'])
    return RMSE(y_true, y_pred)

# Predict a vendor rating from the ratings of similar customers (kNN)
def cf_knn(customer_id, vendor_id, neighbor_size=0):
    """Predict the rating ``customer_id`` would give ``vendor_id``.

    The prediction is the similarity-weighted mean of the ratings other
    customers gave the vendor, read from the module-level
    ``rating_full_matrix`` and ``customer_similarity`` frames.

    Args:
        customer_id: Column label of the customer in ``customer_similarity``.
        vendor_id: Column label of the vendor in ``rating_full_matrix``.
        neighbor_size: Number of most-similar raters to use; 0 uses every
            customer who rated the vendor.

    Returns:
        The predicted rating. ``valid_rating_mean`` is returned instead when
        the vendor is unknown, when ``neighbor_size`` is given but fewer than
        two customers rated the vendor, or when the similarity weights sum to
        zero (previously a division by zero that produced NaN).
    """
    if vendor_id not in rating_full_matrix:
        return valid_rating_mean

    # Ratings of the vendor by all customers, without the customers who did not rate it
    vendor_ratings = rating_full_matrix[vendor_id]
    none_rating_idx = vendor_ratings[vendor_ratings.isnull()].index
    vendor_ratings = vendor_ratings.drop(none_rating_idx)
    # Similarity between this customer and each customer who rated the vendor
    sim_scores = customer_similarity[customer_id].drop(none_rating_idx)

    if neighbor_size != 0:
        # A neighbourhood needs at least two raters; otherwise use the global mean
        if len(sim_scores) <= 1:
            return valid_rating_mean
        neighbor_size = min(neighbor_size, len(sim_scores))
        # Convert to NumPy arrays so both can be reordered with the same argsort result
        sim_scores = np.array(sim_scores)
        vendor_ratings = np.array(vendor_ratings)
        # argsort is ascending, so the most similar raters are at the end
        customer_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[customer_idx][-neighbor_size:]
        vendor_ratings = vendor_ratings[customer_idx][-neighbor_size:]

    weight_total = sim_scores.sum()
    # No raters, or only raters with zero similarity: the weighted mean is undefined
    if weight_total == 0:
        return valid_rating_mean
    return np.dot(sim_scores, vendor_ratings) / weight_total

# Score the kNN predictor (RMSE over the known ratings) with a neighbour size of 20
knn_score(cf_knn, neigbor_size=20)

# Function which presents a recommendation list for a certain customer by CF

# Recommend restaurants to one specific customer
def cf_recom_vendor(customer_id, n_items, neighbor_size=0):
    """Return the top ``n_items`` vendors recommended to ``customer_id``.

    Every vendor the customer has not rated gets a predicted rating from
    ``cf_knn``; vendors the customer already rated are scored 0.

    Args:
        customer_id: Row label of the customer in ``rating_full_matrix``.
        n_items: Number of vendors to return.
        neighbor_size: Neighbour count forwarded to ``cf_knn``.

    Returns:
        A DataFrame with the columns ``vendor_id``, ``mean_rating`` and
        ``vendor_tag_name``, one row per recommended vendor, ordered from the
        highest predicted rating down.
    """
    # The customer's own row of the rating matrix, reused to hold the scores
    customer_vendor = rating_full_matrix.loc[customer_id].copy()

    for vendor in rating_full_matrix:
        if pd.notnull(customer_vendor.loc[vendor]):
            # Already rated by this customer: score 0
            customer_vendor.loc[vendor] = 0
        else:
            # Not rated yet: use the predicted rating
            customer_vendor.loc[vendor] = cf_knn(customer_id, vendor, neighbor_size)

    # Highest predicted ratings first
    vendor_sort = customer_vendor.sort_values(ascending=False)[:n_items]

    # Bug fix: vendor_sort is indexed by vendor ID, while df1_train_for_anal is
    # indexed by row number, so .loc[vendor_sort.index] returned unrelated rows.
    # Vendor details are now looked up through the vendor_id column.
    vendor_info = (
        df1_train_for_anal[['vendor_id', 'mean_rating', 'vendor_tag_name']]
        .drop_duplicates('vendor_id')
        .set_index('vendor_id')
    )
    recom_vendors = (
        vendor_info.reindex(vendor_sort.index)
        .rename_axis('vendor_id')
        .reset_index()
    )
    return recom_vendors

# Recommend 5 restaurants to the customer with ID 'ZZV76GY' (neighbour size 30)
cf_recom_vendor(customer_id='ZZV76GY', n_items=5, neighbor_size=30)
実行結果

k近傍法を使ったモデルの予測誤差（RMSE、小さいほど良い）：0.3865892604982157
参考サイト

Recomm. system for Deli. App.(CF, MF, TF-IDF, D2V) | Kaggle
*1: RMSEの計算式は np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))
ぺーぱーの日々

上機嫌でいること、夢中でいることを目標に、今日も色んなことに手を出します。

機械学習で好みのレストランをおすすめする

今回の目的

アウトプットの活用定義(5W1H)

who　飲食店サイトが

実装手順

実装コード

実行結果

参考サイト

今回の目的

アウトプットの活用定義(5W1H)

who 飲食店サイトが

実装手順

実装コード

実行結果

参考サイト

who　飲食店サイトが