Yelp Recommender Systems

참고 노트북 | Yelp Dataset: SurpriseMe Recommendation System

사용 라이브러리

  
import os
import re
import string

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer

Data Load

  
# Load the business table (one JSON object per line) and keep only restaurants.
business_json_path = './data//yelp_academic_dataset_business.json'
df_yelp_business = pd.read_json(business_json_path, lines=True)

# Missing values become the string 'NA' in every column -- presumably so that
# the .str.contains() mask below is boolean for every row.
df_yelp_business = df_yelp_business.fillna('NA')

is_restaurant = df_yelp_business['categories'].str.contains('Restaurants')
df_yelp_business = df_yelp_business[is_restaurant]
print('Final Shape: ', df_yelp_business.shape)

Final Shape:  (52268, 14)

  
# Read the review file in chunks of 100000 lines and keep only reviews that
# belong to the restaurants selected above. Only the first MAX_REVIEW_CHUNKS
# chunks are read, to bound memory and run time.
MAX_REVIEW_CHUNKS = 4

# Hoisted: the set of wanted business ids does not change between chunks.
restaurant_ids = df_yelp_business['business_id']

review_chunks = []
# The chunked reader is used as a context manager so the underlying file is
# closed even though we stop before the file is exhausted.
with pd.read_json("./data/yelp_academic_dataset_review.json", chunksize=100000, lines=True) as df_yelp_review_iter:
    for i, df in enumerate(df_yelp_review_iter, start=1):
        review_chunks.append(df[df['business_id'].isin(restaurant_ids)])
        print(i)  # progress: number of chunks processed so far
        if i == MAX_REVIEW_CHUNKS:
            break

# Concatenate once at the end instead of growing a DataFrame inside the loop
# (which re-copies all previously read rows on every iteration).
df_yelp_review = pd.concat(review_chunks) if review_chunks else pd.DataFrame()

  
# Drop restaurants that have no review among the loaded review chunks.
reviewed_business_ids = df_yelp_review['business_id']
has_reviews = df_yelp_business['business_id'].isin(reviewed_business_ids)
df_yelp_business = df_yelp_business[has_reviews]

print('Final businesses shape: ', df_yelp_business.shape)
print('Final review shape: ', df_yelp_review.shape)

Final businesses shape:  (4937, 14)
Final review shape:  (283029, 9)

Preprocessing

  
def clean_text(text, stops=None):
    """Normalize one review string for bag-of-words vectorization.

    Steps, in order: lowercase and split on whitespace; drop stopwords and
    tokens shorter than 3 characters; re-join with single spaces; then apply
    a fixed sequence of regex substitutions that blank out unexpected
    characters, expand common English contractions, pad a few symbols with
    spaces, and collapse runs of whitespace.

    Punctuation is handled by the regex passes. ``str.translate`` is
    deliberately not called with ``string.punctuation``: used as a
    translation table that string maps control characters onto punctuation
    (newline -> '+', tab -> '*') instead of removing punctuation, which
    injected spurious '+' tokens wherever a review contained a line break.

    Args:
        text: The raw review text (a ``str``).
        stops: Optional collection of lowercase words to drop. When omitted,
            NLTK's English stopword list is used and cached on first use.

    Returns:
        The cleaned text. It may end with a trailing space when the last
        substitution that fired appended one.
    """
    if stops is None:
        # Build the default stopword set once rather than on every call;
        # this function is applied to every review.
        stops = getattr(clean_text, "_english_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            clean_text._english_stops = stops

    # Lowercase and split on whitespace (line breaks count as whitespace).
    words = text.lower().split()

    # Remove stopwords and very short tokens.
    words = [w for w in words if w not in stops and len(w) >= 3]

    text = " ".join(words)

    # Blank out every character outside the allowed class. Note that `+-=`
    # inside the class is a range ('+' through '='), so ':' ';' '<' survive too.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    # Expand contractions.
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Drop separators; keep a few symbols as stand-alone tokens.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    # "5k" -> "5000"
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    # Re-join abbreviations that the steps above split apart.
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): `\0` is a NUL escape, so this pattern looks for NUL + "s",
    # which the first substitution has already blanked out; presumably "0s"
    # was intended -- confirm before changing.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    # Collapse whitespace runs left behind by the substitutions.
    text = re.sub(r"\s{2,}", " ", text)
    return text

  
%%time
df_yelp_review['text'] = df_yelp_review['text'].apply(clean_text)

CPU times: total: 1min 9s
Wall time: 1min 9s

Top 100 Vocabularies

  
# Bag-of-words over the cleaned reviews. Terms appearing in fewer than 1% or
# more than 99% of the reviews are dropped from the vocabulary.
review_tokenizer = WordPunctTokenizer()
vectorizer_reviews = CountVectorizer(
    tokenizer=review_tokenizer.tokenize,
    min_df=.01,
    max_df=.99,
)
vectorized_reviews = vectorizer_reviews.fit_transform(df_yelp_review['text'])

# (number of reviews, vocabulary size)
vectorized_reviews.shape

(283029, 886)

  
# Preview the first 100 vocabulary entries. As the recorded output shows,
# these come in sorted (alphabetical) order, not ranked by frequency.
review_terms_preview = vectorizer_reviews.get_feature_names_out()[:100]
' | '.join(review_terms_preview)

'! | + | - | 00 | 1 | 10 | 12 | 15 | 2 | 20 | 3 | 30 | 4 | 5 | 50 | 6 | 7 | 8 | : | ; | a | able | about | absolutely | accommodating | across | actually | add | added | addition | afternoon | again | ago | all | almost | along | already | also | although | always | am | amazing | ambiance | american | amount | and | another | anyone | anything | anyway | anywhere | appetizer | appetizers | are | area | around | arrived | as | ask | asked | ate | atmosphere | attention | attentive | authentic | available | average | avocado | away | awesome | awful | back | bacon | bad | baked | bar | bartender | based | basically | bbq | be | beans | beautiful | beef | beer | beers | before | behind | believe | best | better | beyond | big | bill | birthday | bit | bite | black | bland | blue'

Top 100 Categories

  
def _split_categories(category_string):
    """Split a 'A, B, C' category string into its individual category names."""
    return category_string.split(', ')


# One column per distinct category; no document-frequency pruning
# (min_df=1 document, max_df=100% of documents).
vectorizer_categories = CountVectorizer(
    tokenizer=_split_categories,
    min_df=1,
    max_df=1.,
)
vectorized_categories = vectorizer_categories.fit_transform(df_yelp_business['categories'])

# (number of restaurants, number of categories)
vectorized_categories.shape

(4937, 387)

  
# Preview the first 100 category names; the recorded output is in sorted
# (alphabetical) order.
category_names_preview = vectorizer_categories.get_feature_names_out()[:100]
' | '.join(category_names_preview)

"acai bowls | accessories | active life | adult entertainment | afghan | african | american (new) | american (traditional) | amusement parks | appliances & repair | arabic | arcades | argentine | armenian | art galleries | arts & crafts | arts & entertainment | asian fusion | austrian | auto detailing | auto glass services | auto repair | automotive | bagels | bakeries | banks & credit unions | bar crawl | barbeque | barbers | bars | bartenders | basque | battery stores | batting cages | beaches | beauty & spas | bed & breakfast | beer | beer bar | beer gardens | beer tours | beverage store | bistros | boat charters | boat tours | boating | body shops | books | bookstores | bowling | brasseries | brazilian | breakfast & brunch | breweries | brewpubs | british | bubble tea | buffets | building supplies | burgers | burmese | business consulting | butcher | cabaret | cafes | cafeteria | cajun/creole | calabrian | cambodian | canadian (new) | candy stores | cannabis dispensaries | cantonese | car dealers | car stereo installation | cards & stationery | caribbean | casinos | caterers | cheese shops | cheesesteaks | chicken shop | chicken wings | child care & day care | children's clothing | chinese | chiropractors | chocolatiers & shops | christmas trees | churches | cinema | club crawl | cocktail bars | coffee & tea | coffee roasteries | coffeeshops | colombian | comedy clubs | comfort food | community service/non-profit"

희소 행렬 생성

  
%%time
from scipy import sparse
businessxreview = sparse.csr_matrix(pd.get_dummies(df_yelp_review['business_id']).values)

CPU times: total: 14.6 s
Wall time: 14.7 s

  
# Report the shape of each matrix used for the similarity computation.
# NOTE(review): per the recorded output, businessxreview has one row per
# review (same row count as vectorized_reviews), i.e. it is laid out as
# reviews x restaurants despite its label.
for shape_label, matrix in (
    ('restuarants x categories: \t', vectorized_categories),
    ('restuarants x reviews: \t\t', businessxreview),
    ('reviews x words: \t\t', vectorized_reviews),
):
    print(shape_label, matrix.shape)

restuarants x categories: 	 (4937, 387)
restuarants x reviews: 		 (283029, 4937)
reviews x words: 		 (283029, 886)

리뷰와 평점이 좋은 다른 식당 추천

  
# Show 5 randomly drawn restaurants to pick a query restaurant from.
# No random_state is passed, so a re-run will display different rows.
df_yelp_business.sample(5)

	business_id	name	address	city	state	postal_code	latitude	longitude	stars	review_count	is_open	attributes	categories	hours
6273	DptW6vZmrd7ttS0RCaWx2w	Xwrecks Restaurant & Lounge	9303 50th Street NW	Edmonton	AB	T6B 2L5	53.530919	-113.417837	2.0	7	0	{'Alcohol': 'u'full_bar'', 'RestaurantsPriceRa...	Restaurants, Bars, Nightlife, American (Tradit...	{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ...
12352	4w6Z5v0uVt08oSBaA3342A	Wawa	600 Cinnaminson Ave	Palmyra	NJ	08065	39.998409	-75.035118	3.5	5	1	{'RestaurantsPriceRange2': '4', 'BusinessAccep...	Convenience Stores, Automotive, Coffee & Tea, ...	{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...
3136	N44roXfLNkBdpINQDjEFOQ	Carisilo's Mexican Restaurant	1978 Vandalia St	Collinsville	IL	62234	38.695337	-89.966691	4.0	65	1	{'RestaurantsDelivery': 'False', 'Alcohol': ''...	Mexican, Restaurants	{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...
9834	-SFSt3FkjGfavnyMpHsZPA	Enjoi Sweets & Company	4707 W Gandy Blvd, Ste 7	Tampa	FL	33611	27.893760	-82.525167	4.5	9	0	{'NoiseLevel': 'u'quiet'', 'BusinessAcceptsBit...	Desserts, Food, Cafes, Restaurants, Food Truck...	{'Thursday': '12:0-21:0', 'Friday': '12:0-21:0...
1427	jLaPtjlLfRSaoBWIcHcSQg	The Mad Crab	8080 Olive Blvd	University City	MO	63130	38.672734	-90.345018	3.5	156	1	{'Caters': 'False', 'Alcohol': 'u'beer_and_win...	Seafood, Cajun/Creole, American (New), Restaur...	{'Monday': '15:0-22:0', 'Tuesday': '15:0-22:0'...

  
# business_id of the query restaurant that recommendations are computed for
# ("Enjoi Sweets & Company" in the sample shown above).
business_choose = '-SFSt3FkjGfavnyMpHsZPA' # Desserts, Food, Cafes, Restaurants ...

  
# Cleaned review texts of the query restaurant.
is_query_review = df_yelp_review['business_id'] == business_choose
new_reviews = df_yelp_review.loc[is_query_review, 'text']

# Print the first 100 characters of each review, one review per line.
print('\n'.join(review[:100] for review in new_reviews.tolist()))

wow probably best cupcakes i have since moved tampa + + i stopped guys came flicks food trucks heard
pleasure experiencing enjoi sweets recent food truck rally work later day dessert truck best place e
delicious cupcakes review say much liked place went tried red velvet chocolate chip brownie fresh yu
one word delectable ! + + stumbled upon food truck which also storefront flicks food trucks past mon
tried cupcakes food truck family ordered following : + + chocolate chocolate delicious moist cake ch
unable contact month left facebook review told anything nice say keep myself understand things come 
used enjoi sweets company event fantastic ! everything setting event food itself joi jon pleasure wo
tried italian mango drink super delicious got get enough ! 
enjoi sweets one favorite food trucks love design course delicious cupcakes catered events say serve

  
# Category string(s) of the query restaurant.
is_query_business = df_yelp_business['business_id'] == business_choose
new_categories = df_yelp_business.loc[is_query_business, 'categories']
new_categories.tolist()

['Desserts, Food, Cafes, Restaurants, Food Trucks, American (Traditional)']

유사도 계산

  
from scipy.spatial.distance import cdist

# --- Review similarity ---
# Query profile: mean word-count vector over the query restaurant's reviews.
query_review_profile = vectorizer_reviews.transform(new_reviews).todense().mean(axis=0)
# Word counts summed over reviews, one row per businessxreview column.
word_counts_per_business = vectorized_reviews.T.dot(businessxreview).T.todense()
dists1 = cdist(query_review_profile,
               word_counts_per_business,
               metric='correlation')

# --- Category similarity ---
# Query profile against the category indicator row of every restaurant.
query_category_profile = vectorizer_categories.transform(new_categories).todense().mean(axis=0)
category_rows = vectorized_categories.todense()
dists2 = cdist(query_category_profile,
               category_rows,
               metric='correlation')

# NOTE(review): dists1 follows the column order of businessxreview while
# dists2 follows the row order of df_yelp_business; the two are combined
# element-wise afterwards, so those orders must refer to the same restaurants.

  
# Put the two distance vectors side by side (one row per restaurant, one
# column per distance type) and average them with equal weight.
review_distances = dists1.ravel()
category_distances = dists2.ravel()
dists_together = np.vstack((review_distances, category_distances)).T

dists = dists_together.mean(axis=1)
dists

array([0.54952985, 0.50191353, 0.56616524, ..., 0.69466944, 0.64917578,
       0.4334572 ])

  
# Positions of the 10 most similar restaurants (smallest combined distance).
# The query restaurant is removed from the ranking first: it would otherwise
# be recommended to itself. This relies on dists having one entry per row of
# df_yelp_business, in the same order.
ranked_positions = dists.argsort().ravel()
is_query_row = (df_yelp_business['business_id'] == business_choose).to_numpy()
closest = ranked_positions[~is_query_row[ranked_positions]][:10]

기준 레스토랑

  
# Display the query restaurant with the columns used to judge recommendations.
summary_columns = ['business_id', 'categories', 'name', 'stars']
query_row_mask = df_yelp_business['business_id'] == business_choose
df_yelp_business.loc[query_row_mask, summary_columns]

	business_id	categories	name	stars
9834	-SFSt3FkjGfavnyMpHsZPA	Desserts, Food, Cafes, Restaurants, Food Truck...	Enjoi Sweets & Company	4.5

	business_id	categories	name	stars
742	dD2p903p8lU0IgXT3OFluA	Breakfast & Brunch, Restaurants, Food, Cafes, ...	Edgehill Cafe	3.5
2548	dcpWZ6Yk_S0HqTlNBi8jiA	Food, Coffee & Tea, Restaurants, Desserts, Cafes	The Woodrack Cafe	4.0
4710	qLrTiIPDlnNX6FYTs29rmg	Restaurants, American (Traditional)	Buddy's Grill	3.5
6720	jVdYRED2iztNaNCoTAhVMA	Restaurants, Salad, Food, Desserts	Have A Greener Day	5.0
8244	iHTL6BPlaPK6xvOa5MIKaQ	American (Traditional), Restaurants, Food, Ame...	Essentially Fries	4.0
9834	-SFSt3FkjGfavnyMpHsZPA	Desserts, Food, Cafes, Restaurants, Food Truck...	Enjoi Sweets & Company	4.5
10337	hQcAPRwuYFPAbhbpeNPEgA	Bakeries, American (Traditional), Food, Restau...	Apple Farm Diner and Bakery	2.5
11701	tYCok-NtWvg8_k7woeB83w	Desserts, American (Traditional), Cafes, Resta...	Grand Lux Cafe	3.5
11748	newkruvn1rhEvueEc9y1Mw	Food, Restaurants, Desserts, Ice Cream & Froze...	Moo Moo Milk Bar	3.5
12506	9dW3CVyvnTXdkXg2AOyBfw	Desserts, Coffee & Tea, Cafes, Donuts, Food, S...	Birds Nest Cafe	4.5

Yelp 데이터셋과 텍스트 유사도를 이용한 추천 시스템

Yelp Recommender Systems

사용 라이브러리

Data Load

Preprocessing

Top 100 Vocabularies

Top 100 Categories

희소 행렬 생성

리뷰와 평점이 좋은 다른 식당 추천

유사도 계산

기준 레스토랑

추천된 레스토랑 목록

Yelp 데이터셋과 텍스트 유사도를 이용한 추천 시스템

Yelp Recommender Systems

사용 라이브러리

Data Load

Preprocessing

Top 100 Vocabularies

Top 100 Categories

희소 행렬 생성

리뷰와 평점이 좋은 다른 식당 추천

유사도 계산

기준 레스토랑

추천된 레스토랑 목록

Further Reading

DACON 쇼핑몰 지점별 매출액 예측 경진대회 1 EDA/Preprocessing

DACON 쇼핑몰 지점별 매출액 예측 경진대회 2 모델링 (회귀 모델)

차원 축소(Dimension Reduction) 정리