Yelp Recommender Systems
참고 노트북 | Yelp Dataset: SurpriseMe Recommendation System
사용 라이브러리
1
2
3
4
5
6
7
8
9
10
11
12
13
import os
import re
import string
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
Data Load
1
2
3
4
# Load the Yelp business table (JSON Lines: one record per line).
business_json_path = './data//yelp_academic_dataset_business.json'
df_yelp_business = pd.read_json(business_json_path, lines=True)

# Missing values are replaced with the literal 'NA' before the substring test
# below, so every row has a string in 'categories'.
df_yelp_business.fillna('NA', inplace=True)

# Keep only businesses whose category string mentions 'Restaurants'.
restaurant_mask = df_yelp_business['categories'].str.contains('Restaurants')
df_yelp_business = df_yelp_business[restaurant_mask]
print('Final Shape: ',df_yelp_business.shape)
1
Final Shape: (52268, 14)
1
2
3
4
5
6
7
8
9
10
# Stream the review file chunk by chunk and keep only the reviews that belong
# to the restaurants selected above. Reading stops after a fixed number of
# chunks, so only the beginning of the file is used.
MAX_REVIEW_CHUNKS = 4  # each chunk holds up to 100,000 lines
restaurant_ids = df_yelp_business['business_id']
review_chunks = []

# The reader is used as a context manager so the underlying file handle is
# closed even though iteration stops before the end of the file.
with pd.read_json("./data/yelp_academic_dataset_review.json",
                  chunksize=100000, lines=True) as df_yelp_review_iter:
    for i, df in enumerate(df_yelp_review_iter, start=1):
        review_chunks.append(df[df['business_id'].isin(restaurant_ids)])
        print(i)  # progress: number of chunks processed so far
        if i == MAX_REVIEW_CHUNKS:
            break

# Concatenate once at the end: concatenating inside the loop copies every
# previously collected row again on each iteration.
df_yelp_review = pd.concat(review_chunks) if review_chunks else pd.DataFrame()
1
2
3
4
1
2
3
4
1
2
3
4
# Drop restaurants that have no review among the loaded review chunks.
reviewed_ids = df_yelp_review['business_id']
has_reviews = df_yelp_business['business_id'].isin(reviewed_ids)
df_yelp_business = df_yelp_business[has_reviews]

for label, frame in (('Final businesses shape: ', df_yelp_business),
                     ('Final review shape: ', df_yelp_review)):
    print(label, frame.shape)
1
2
Final businesses shape: (4937, 14)
Final review shape: (283029, 9)
Preprocessing
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# Cache for the stop-word set so it is built once instead of once per review.
_STOPWORD_CACHE = {}

# Ordered (pattern, replacement) rewrite rules, compiled once at import time.
# The order matters: later rules operate on the output of earlier ones.
_CLEAN_TEXT_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Blank out everything except letters, digits and a few symbols.
        # Inside the class ``+-=`` is a character range, so ``:``, ``;`` and
        # ``<`` survive this step as well.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        # Expand common English contractions.
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Drop separators, and pad the remaining symbols with spaces so they
        # become stand-alone tokens.
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        # "5k" -> "5000"
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # "1990s" -> "1990". The previous pattern r"\0s" matched a NUL
        # character followed by "s", never a digit zero, so it could not fire.
        (r"0s\b", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        # Collapse runs of whitespace left behind by the rules above.
        (r"\s{2,}", " "),
    )
)


def _english_stopwords():
    """Return the NLTK English stop-word set, building it only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def clean_text(text):
    """Normalize one review string for bag-of-words vectorization.

    The text is lower-cased and split on whitespace, English stop words and
    tokens shorter than three characters are dropped, and the remaining
    tokens are re-joined and passed through the ordered regex rules in
    ``_CLEAN_TEXT_RULES``.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text as a single string.
    """
    # NOTE: the former ``text.translate(string.punctuation)`` step was
    # removed. ``str.translate`` indexes its table by code point, so passing
    # the punctuation string did not strip punctuation; it replaced control
    # characters 0-31 with punctuation instead ("\n" -> "+", "\t" -> "*").
    # That glued words across line breaks into single tokens before the
    # stop-word filter ran and later produced spurious "+" tokens.
    # Punctuation is handled by the regex rules, which need the apostrophes
    # intact to expand contractions.

    # Lower-case, split on whitespace, and filter stop words / short tokens.
    stops = _english_stopwords()
    kept = [w for w in text.lower().split() if w not in stops and len(w) >= 3]
    text = " ".join(kept)

    # Apply the rewrite rules in their declared order.
    for pattern, replacement in _CLEAN_TEXT_RULES:
        text = pattern.sub(replacement, text)
    return text
1
2
%%time
df_yelp_review['text'] = df_yelp_review['text'].apply(clean_text)
1
2
CPU times: total: 1min 9s
Wall time: 1min 9s
First 100 Vocabulary Terms (alphabetical order)
1
2
3
4
# Bag-of-words counts over the cleaned reviews. With fractional min_df/max_df,
# terms found in fewer than 1% or more than 99% of the reviews are left out of
# the vocabulary.
word_punct_tokenize = WordPunctTokenizer().tokenize
vectorizer_reviews = CountVectorizer(
    tokenizer=word_punct_tokenize,
    min_df=.01,
    max_df=.99,
)
vectorized_reviews = vectorizer_reviews.fit_transform(df_yelp_review['text'])
vectorized_reviews.shape
1
(283029, 886)
1
# Show the first 100 entries of the fitted vocabulary, separated by ' | '.
leading_terms = vectorizer_reviews.get_feature_names_out()[:100]
' | '.join(leading_terms)
1
'! | + | - | 00 | 1 | 10 | 12 | 15 | 2 | 20 | 3 | 30 | 4 | 5 | 50 | 6 | 7 | 8 | : | ; | a | able | about | absolutely | accommodating | across | actually | add | added | addition | afternoon | again | ago | all | almost | along | already | also | although | always | am | amazing | ambiance | american | amount | and | another | anyone | anything | anyway | anywhere | appetizer | appetizers | are | area | around | arrived | as | ask | asked | ate | atmosphere | attention | attentive | authentic | available | average | avocado | away | awesome | awful | back | bacon | bad | baked | bar | bartender | based | basically | bbq | be | beans | beautiful | beef | beer | beers | before | behind | believe | best | better | beyond | big | bill | birthday | bit | bite | black | bland | blue'
First 100 Categories (alphabetical order)
1
2
3
4
def _split_categories(raw):
    """Split a ', '-separated category string into individual category names."""
    return raw.split(', ')


# One column per distinct category; no category is filtered out by document
# frequency (min_df=1, max_df=1.).
vectorizer_categories = CountVectorizer(
    tokenizer=_split_categories,
    min_df=1,
    max_df=1.,
)
vectorized_categories = vectorizer_categories.fit_transform(df_yelp_business['categories'])
vectorized_categories.shape
1
(4937, 387)
1
# Show the first 100 entries of the fitted category vocabulary.
leading_categories = vectorizer_categories.get_feature_names_out()[:100]
' | '.join(leading_categories)
1
"acai bowls | accessories | active life | adult entertainment | afghan | african | american (new) | american (traditional) | amusement parks | appliances & repair | arabic | arcades | argentine | armenian | art galleries | arts & crafts | arts & entertainment | asian fusion | austrian | auto detailing | auto glass services | auto repair | automotive | bagels | bakeries | banks & credit unions | bar crawl | barbeque | barbers | bars | bartenders | basque | battery stores | batting cages | beaches | beauty & spas | bed & breakfast | beer | beer bar | beer gardens | beer tours | beverage store | bistros | boat charters | boat tours | boating | body shops | books | bookstores | bowling | brasseries | brazilian | breakfast & brunch | breweries | brewpubs | british | bubble tea | buffets | building supplies | burgers | burmese | business consulting | butcher | cabaret | cafes | cafeteria | cajun/creole | calabrian | cambodian | canadian (new) | candy stores | cannabis dispensaries | cantonese | car dealers | car stereo installation | cards & stationery | caribbean | casinos | caterers | cheese shops | cheesesteaks | chicken shop | chicken wings | child care & day care | children's clothing | chinese | chiropractors | chocolatiers & shops | christmas trees | churches | cinema | club crawl | cocktail bars | coffee & tea | coffee roasteries | coffeeshops | colombian | comedy clubs | comfort food | community service/non-profit"
희소 행렬 생성
1
2
3
%%time
from scipy import sparse
businessxreview = sparse.csr_matrix(pd.get_dummies(df_yelp_review['business_id']).values)
1
2
CPU times: total: 14.6 s
Wall time: 14.7 s
1
2
3
# Shapes of the three matrices used by the similarity step. businessxreview
# has one row per review and one column per restaurant, so it is labelled
# "reviews x restaurants".
print('restaurants x categories: \t', vectorized_categories.shape)
print('reviews x restaurants: \t\t' , businessxreview.shape)
print('reviews x words: \t\t', vectorized_reviews.shape)
1
2
3
restuarants x categories: (4937, 387)
restuarants x reviews: (283029, 4937)
reviews x words: (283029, 886)
리뷰와 카테고리가 유사한 다른 식당 추천
1
# Peek at five randomly drawn restaurants.
df_yelp_business.sample(n=5)
business_id | name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | attributes | categories | hours | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6273 | DptW6vZmrd7ttS0RCaWx2w | Xwrecks Restaurant & Lounge | 9303 50th Street NW | Edmonton | AB | T6B 2L5 | 53.530919 | -113.417837 | 2.0 | 7 | 0 | {'Alcohol': 'u'full_bar'', 'RestaurantsPriceRa... | Restaurants, Bars, Nightlife, American (Tradit... | {'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ... |
12352 | 4w6Z5v0uVt08oSBaA3342A | Wawa | 600 Cinnaminson Ave | Palmyra | NJ | 08065 | 39.998409 | -75.035118 | 3.5 | 5 | 1 | {'RestaurantsPriceRange2': '4', 'BusinessAccep... | Convenience Stores, Automotive, Coffee & Tea, ... | {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W... |
3136 | N44roXfLNkBdpINQDjEFOQ | Carisilo's Mexican Restaurant | 1978 Vandalia St | Collinsville | IL | 62234 | 38.695337 | -89.966691 | 4.0 | 65 | 1 | {'RestaurantsDelivery': 'False', 'Alcohol': ''... | Mexican, Restaurants | {'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'... |
9834 | -SFSt3FkjGfavnyMpHsZPA | Enjoi Sweets & Company | 4707 W Gandy Blvd, Ste 7 | Tampa | FL | 33611 | 27.893760 | -82.525167 | 4.5 | 9 | 0 | {'NoiseLevel': 'u'quiet'', 'BusinessAcceptsBit... | Desserts, Food, Cafes, Restaurants, Food Truck... | {'Thursday': '12:0-21:0', 'Friday': '12:0-21:0... |
1427 | jLaPtjlLfRSaoBWIcHcSQg | The Mad Crab | 8080 Olive Blvd | University City | MO | 63130 | 38.672734 | -90.345018 | 3.5 | 156 | 1 | {'Caters': 'False', 'Alcohol': 'u'beer_and_win... | Seafood, Cajun/Creole, American (New), Restaur... | {'Monday': '15:0-22:0', 'Tuesday': '15:0-22:0'... |
1
# business_id of the restaurant used as the query for the recommendations below.
business_choose = '-SFSt3FkjGfavnyMpHsZPA' # Desserts, Food, Cafes, Restaurants ...
1
2
# Cleaned review texts of the chosen business.
is_chosen_review = df_yelp_review['business_id'] == business_choose
new_reviews = df_yelp_review.loc[is_chosen_review, 'text']

# Print the first 100 characters of each review, one review per line.
previews = [review[:100] for review in new_reviews.tolist()]
print('\n'.join(previews))
1
2
3
4
5
6
7
8
9
wow probably best cupcakes i have since moved tampa + + i stopped guys came flicks food trucks heard
pleasure experiencing enjoi sweets recent food truck rally work later day dessert truck best place e
delicious cupcakes review say much liked place went tried red velvet chocolate chip brownie fresh yu
one word delectable ! + + stumbled upon food truck which also storefront flicks food trucks past mon
tried cupcakes food truck family ordered following : + + chocolate chocolate delicious moist cake ch
unable contact month left facebook review told anything nice say keep myself understand things come
used enjoi sweets company event fantastic ! everything setting event food itself joi jon pleasure wo
tried italian mango drink super delicious got get enough !
enjoi sweets one favorite food trucks love design course delicious cupcakes catered events say serve
1
2
# Category string(s) of the chosen business.
is_chosen_business = df_yelp_business['business_id'] == business_choose
new_categories = df_yelp_business.loc[is_chosen_business, 'categories']
new_categories.tolist()
1
['Desserts, Food, Cafes, Restaurants, Food Trucks, American (Traditional)']
유사도 계산
1
2
3
4
5
6
7
8
9
from scipy.spatial.distance import cdist

# NOTE(review): dists1 follows the column order of businessxreview, dists2 the
# row order of vectorized_categories. They are combined element-wise later, so
# both must list the businesses in the same order -- confirm.

# --- Review-text similarity ---------------------------------------------
# Query profile: mean word-count vector over the chosen business's reviews.
query_word_profile = vectorizer_reviews.transform(new_reviews).todense().mean(axis=0)
# Word counts summed over each business's reviews (one row per business).
words_by_business = vectorized_reviews.T.dot(businessxreview).T.todense()
dists1 = cdist(query_word_profile, words_by_business, metric='correlation')

# --- Category similarity ------------------------------------------------
query_category_profile = vectorizer_categories.transform(new_categories).todense().mean(axis=0)
categories_by_business = vectorized_categories.todense()
dists2 = cdist(query_category_profile, categories_by_business, metric='correlation')
1
2
3
4
# One row per business: column 0 is the review distance, column 1 the
# category distance. The combined score is their unweighted mean.
dists_together = np.column_stack((dists1.ravel(), dists2.ravel()))
dists = dists_together.mean(axis=1)
dists
1
2
array([0.54952985, 0.50191353, 0.56616524, ..., 0.69466944, 0.64917578,
0.4334572 ])
1
2
# The 10 most similar restaurants, nearest first.
# Businesses are ranked by combined distance and the query business itself is
# skipped, so it cannot occupy a slot in its own recommendation list.
# Position k of `dists` is taken to refer to row k of df_yelp_business, which
# is the same positional convention used when `closest` is looked up.
ranked = dists.argsort().ravel()
is_query = (df_yelp_business['business_id'] == business_choose).to_numpy()
closest = ranked[~is_query[ranked]][:10]
기준 레스토랑
1
# Summary row of the reference (query) restaurant.
summary_columns = ['business_id', 'categories', 'name', 'stars']
is_reference = df_yelp_business['business_id'] == business_choose
df_yelp_business.loc[is_reference, summary_columns]
business_id | categories | name | stars | |
---|---|---|---|---|
9834 | -SFSt3FkjGfavnyMpHsZPA | Desserts, Food, Cafes, Restaurants, Food Truck... | Enjoi Sweets & Company | 4.5 |
추천된 레스토랑 목록
1
# Positional lookup keeps the rows in the order given by `closest`, so the
# table is displayed in similarity order. Filtering with isin() on the ids
# returned the same rows but in the frame's original order, which discarded
# the ranking.
df_yelp_business.iloc[closest][['business_id', 'categories', 'name', 'stars']]
business_id | categories | name | stars | |
---|---|---|---|---|
742 | dD2p903p8lU0IgXT3OFluA | Breakfast & Brunch, Restaurants, Food, Cafes, ... | Edgehill Cafe | 3.5 |
2548 | dcpWZ6Yk_S0HqTlNBi8jiA | Food, Coffee & Tea, Restaurants, Desserts, Cafes | The Woodrack Cafe | 4.0 |
4710 | qLrTiIPDlnNX6FYTs29rmg | Restaurants, American (Traditional) | Buddy's Grill | 3.5 |
6720 | jVdYRED2iztNaNCoTAhVMA | Restaurants, Salad, Food, Desserts | Have A Greener Day | 5.0 |
8244 | iHTL6BPlaPK6xvOa5MIKaQ | American (Traditional), Restaurants, Food, Ame... | Essentially Fries | 4.0 |
9834 | -SFSt3FkjGfavnyMpHsZPA | Desserts, Food, Cafes, Restaurants, Food Truck... | Enjoi Sweets & Company | 4.5 |
10337 | hQcAPRwuYFPAbhbpeNPEgA | Bakeries, American (Traditional), Food, Restau... | Apple Farm Diner and Bakery | 2.5 |
11701 | tYCok-NtWvg8_k7woeB83w | Desserts, American (Traditional), Cafes, Resta... | Grand Lux Cafe | 3.5 |
11748 | newkruvn1rhEvueEc9y1Mw | Food, Restaurants, Desserts, Ice Cream & Froze... | Moo Moo Milk Bar | 3.5 |
12506 | 9dW3CVyvnTXdkXg2AOyBfw | Desserts, Coffee & Tea, Cafes, Donuts, Food, S... | Birds Nest Cafe | 4.5 |
Comments powered by Disqus.