Home Yelp 데이터셋과 텍스트 유사도를 이용한 추천 시스템
Post
Cancel

Yelp 데이터셋과 텍스트 유사도를 이용한 추천 시스템

Yelp Recommender Systems

참고 노트북 | Yelp Dataset: SurpriseMe Recommendation System

사용 라이브러리

1
2
3
4
5
6
7
8
9
10
11
12
13
import os
import re
import string

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer

Data Load

1
2
3
4
df_yelp_business = pd.read_json('./data//yelp_academic_dataset_business.json', lines=True)
df_yelp_business.fillna('NA', inplace=True)
df_yelp_business = df_yelp_business[df_yelp_business['categories'].str.contains('Restaurants')]
print('Final Shape: ',df_yelp_business.shape)
1
Final Shape:  (52268, 14)
1
2
3
4
5
6
7
8
9
10
df_yelp_review_iter = pd.read_json("./data/yelp_academic_dataset_review.json", chunksize=100000, lines=True)

df_yelp_review = pd.DataFrame()
i=0
for df in df_yelp_review_iter:
    df = df[df['business_id'].isin(df_yelp_business['business_id'])]
    df_yelp_review = pd.concat([df_yelp_review, df])
    i=i+1
    print(i)
    if i==4: break
1
2
3
4
1
2
3
4
1
2
3
4
df_yelp_business = df_yelp_business[df_yelp_business['business_id'].isin(df_yelp_review['business_id'])]

print('Final businesses shape: ', df_yelp_business.shape)
print('Final review shape: ', df_yelp_review.shape)
1
2
Final businesses shape:  (4937, 14)
Final review shape:  (283029, 9)

Preprocessing

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def clean_text(text):
    ## 구두점 제거
    text = text.translate(string.punctuation)
    
    ## 소문자 변경 후 분리
    text = text.lower().split()
    
    ## 불용어 제거
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    return text
1
2
%%time
df_yelp_review['text'] = df_yelp_review['text'].apply(clean_text)
1
2
CPU times: total: 1min 9s
Wall time: 1min 9s

Top 100 Vocabularies

1
2
3
4
vectorizer_reviews = CountVectorizer(min_df = .01,max_df = .99, tokenizer = WordPunctTokenizer().tokenize)
vectorized_reviews = vectorizer_reviews.fit_transform(df_yelp_review['text'])

vectorized_reviews.shape
1
(283029, 886)
1
' | '.join(vectorizer_reviews.get_feature_names_out()[:100])
1
'! | + | - | 00 | 1 | 10 | 12 | 15 | 2 | 20 | 3 | 30 | 4 | 5 | 50 | 6 | 7 | 8 | : | ; | a | able | about | absolutely | accommodating | across | actually | add | added | addition | afternoon | again | ago | all | almost | along | already | also | although | always | am | amazing | ambiance | american | amount | and | another | anyone | anything | anyway | anywhere | appetizer | appetizers | are | area | around | arrived | as | ask | asked | ate | atmosphere | attention | attentive | authentic | available | average | avocado | away | awesome | awful | back | bacon | bad | baked | bar | bartender | based | basically | bbq | be | beans | beautiful | beef | beer | beers | before | behind | believe | best | better | beyond | big | bill | birthday | bit | bite | black | bland | blue'

Top 100 Categoreis

1
2
3
4
vectorizer_categories = CountVectorizer(min_df = 1, max_df = 1., tokenizer = lambda x: x.split(', '))
vectorized_categories = vectorizer_categories.fit_transform(df_yelp_business['categories'])

vectorized_categories.shape
1
(4937, 387)
1
' | '.join(vectorizer_categories.get_feature_names_out()[:100])
1
"acai bowls | accessories | active life | adult entertainment | afghan | african | american (new) | american (traditional) | amusement parks | appliances & repair | arabic | arcades | argentine | armenian | art galleries | arts & crafts | arts & entertainment | asian fusion | austrian | auto detailing | auto glass services | auto repair | automotive | bagels | bakeries | banks & credit unions | bar crawl | barbeque | barbers | bars | bartenders | basque | battery stores | batting cages | beaches | beauty & spas | bed & breakfast | beer | beer bar | beer gardens | beer tours | beverage store | bistros | boat charters | boat tours | boating | body shops | books | bookstores | bowling | brasseries | brazilian | breakfast & brunch | breweries | brewpubs | british | bubble tea | buffets | building supplies | burgers | burmese | business consulting | butcher | cabaret | cafes | cafeteria | cajun/creole | calabrian | cambodian | canadian (new) | candy stores | cannabis dispensaries | cantonese | car dealers | car stereo installation | cards & stationery | caribbean | casinos | caterers | cheese shops | cheesesteaks | chicken shop | chicken wings | child care & day care | children's clothing | chinese | chiropractors | chocolatiers & shops | christmas trees | churches | cinema | club crawl | cocktail bars | coffee & tea | coffee roasteries | coffeeshops | colombian | comedy clubs | comfort food | community service/non-profit"

희소 행렬 생성

1
2
3
%%time
from scipy import sparse
businessxreview = sparse.csr_matrix(pd.get_dummies(df_yelp_review['business_id']).values)
1
2
CPU times: total: 14.6 s
Wall time: 14.7 s
1
2
3
print('restuarants x categories: \t', vectorized_categories.shape) 
print('restuarants x reviews: \t\t' , businessxreview.shape) 
print('reviews x words: \t\t', vectorized_reviews.shape)
1
2
3
restuarants x categories: 	 (4937, 387)
restuarants x reviews: 		 (283029, 4937)
reviews x words: 		 (283029, 886)

리뷰와 평점이 좋은 다른 식당 추천

1
df_yelp_business.sample(5)
business_idnameaddresscitystatepostal_codelatitudelongitudestarsreview_countis_openattributescategorieshours
6273DptW6vZmrd7ttS0RCaWx2wXwrecks Restaurant & Lounge9303 50th Street NWEdmontonABT6B 2L553.530919-113.4178372.070{'Alcohol': 'u'full_bar'', 'RestaurantsPriceRa...Restaurants, Bars, Nightlife, American (Tradit...{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ...
123524w6Z5v0uVt08oSBaA3342AWawa600 Cinnaminson AvePalmyraNJ0806539.998409-75.0351183.551{'RestaurantsPriceRange2': '4', 'BusinessAccep...Convenience Stores, Automotive, Coffee & Tea, ...{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...
3136N44roXfLNkBdpINQDjEFOQCarisilo's Mexican Restaurant1978 Vandalia StCollinsvilleIL6223438.695337-89.9666914.0651{'RestaurantsDelivery': 'False', 'Alcohol': ''...Mexican, Restaurants{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...
9834-SFSt3FkjGfavnyMpHsZPAEnjoi Sweets & Company4707 W Gandy Blvd, Ste 7TampaFL3361127.893760-82.5251674.590{'NoiseLevel': 'u'quiet'', 'BusinessAcceptsBit...Desserts, Food, Cafes, Restaurants, Food Truck...{'Thursday': '12:0-21:0', 'Friday': '12:0-21:0...
1427jLaPtjlLfRSaoBWIcHcSQgThe Mad Crab8080 Olive BlvdUniversity CityMO6313038.672734-90.3450183.51561{'Caters': 'False', 'Alcohol': 'u'beer_and_win...Seafood, Cajun/Creole, American (New), Restaur...{'Monday': '15:0-22:0', 'Tuesday': '15:0-22:0'...
1
business_choose = '-SFSt3FkjGfavnyMpHsZPA' # Desserts, Food, Cafes, Restaurants ...
1
2
new_reviews = df_yelp_review.loc[df_yelp_review['business_id'] == business_choose, 'text']
print('\n'.join([r[:100] for r in new_reviews.tolist()]))
1
2
3
4
5
6
7
8
9
wow probably best cupcakes i have since moved tampa + + i stopped guys came flicks food trucks heard
pleasure experiencing enjoi sweets recent food truck rally work later day dessert truck best place e
delicious cupcakes review say much liked place went tried red velvet chocolate chip brownie fresh yu
one word delectable ! + + stumbled upon food truck which also storefront flicks food trucks past mon
tried cupcakes food truck family ordered following : + + chocolate chocolate delicious moist cake ch
unable contact month left facebook review told anything nice say keep myself understand things come 
used enjoi sweets company event fantastic ! everything setting event food itself joi jon pleasure wo
tried italian mango drink super delicious got get enough ! 
enjoi sweets one favorite food trucks love design course delicious cupcakes catered events say serve
1
2
new_categories = df_yelp_business.loc[df_yelp_business['business_id'] == business_choose, 'categories']
new_categories.tolist()
1
['Desserts, Food, Cafes, Restaurants, Food Trucks, American (Traditional)']

유사도 계산

1
2
3
4
5
6
7
8
9
from scipy.spatial.distance import cdist
# find most similar reviews
dists1 = cdist(vectorizer_reviews.transform(new_reviews).todense().mean(axis=0), 
              vectorized_reviews.T.dot(businessxreview).T.todense(), 
               metric='correlation')
# find most similar categories
dists2 = cdist(vectorizer_categories.transform(new_categories).todense().mean(axis=0), 
              vectorized_categories.todense(), 
               metric='correlation')
1
2
3
4
dists_together = np.vstack([dists1.ravel(), dists2.ravel()]).T

dists = dists_together.mean(axis=1)
dists
1
2
array([0.54952985, 0.50191353, 0.56616524, ..., 0.69466944, 0.64917578,
       0.4334572 ])
1
2
# 가장 유사한 10개의 레스토랑
closest = dists.argsort().ravel()[:10]

기준 레스토랑

1
df_yelp_business.loc[df_yelp_business['business_id']== business_choose, ['business_id', 'categories', 'name', 'stars']]
business_idcategoriesnamestars
9834-SFSt3FkjGfavnyMpHsZPADesserts, Food, Cafes, Restaurants, Food Truck...Enjoi Sweets & Company4.5

추천된 레스토랑 목록

1
df_yelp_business.loc[df_yelp_business['business_id'].isin(df_yelp_business['business_id'].iloc[closest]), ['business_id', 'categories', 'name', 'stars']]
business_idcategoriesnamestars
742dD2p903p8lU0IgXT3OFluABreakfast & Brunch, Restaurants, Food, Cafes, ...Edgehill Cafe3.5
2548dcpWZ6Yk_S0HqTlNBi8jiAFood, Coffee & Tea, Restaurants, Desserts, CafesThe Woodrack Cafe4.0
4710qLrTiIPDlnNX6FYTs29rmgRestaurants, American (Traditional)Buddy's Grill3.5
6720jVdYRED2iztNaNCoTAhVMARestaurants, Salad, Food, DessertsHave A Greener Day5.0
8244iHTL6BPlaPK6xvOa5MIKaQAmerican (Traditional), Restaurants, Food, Ame...Essentially Fries4.0
9834-SFSt3FkjGfavnyMpHsZPADesserts, Food, Cafes, Restaurants, Food Truck...Enjoi Sweets & Company4.5
10337hQcAPRwuYFPAbhbpeNPEgABakeries, American (Traditional), Food, Restau...Apple Farm Diner and Bakery2.5
11701tYCok-NtWvg8_k7woeB83wDesserts, American (Traditional), Cafes, Resta...Grand Lux Cafe3.5
11748newkruvn1rhEvueEc9y1MwFood, Restaurants, Desserts, Ice Cream & Froze...Moo Moo Milk Bar3.5
125069dW3CVyvnTXdkXg2AOyBfwDesserts, Coffee & Tea, Cafes, Donuts, Food, S...Birds Nest Cafe4.5
This post is licensed under CC BY 4.0 by the author.

[PyTorch] 실험: GD vs SGD

NLP/TA 기초

Comments powered by Disqus.