mirror of
https://github.com/metafy-social/python-scripts.git
synced 2024-11-24 04:21:12 +00:00
Merge pull request #213 from neocollege/neo-user
add sentiment analysis project
This commit is contained in:
commit
e85beacbc3
1001
Projects/Sentiment Analysis/Restaurant_Reviews.tsv
Normal file
1001
Projects/Sentiment Analysis/Restaurant_Reviews.tsv
Normal file
File diff suppressed because it is too large
Load Diff
145
Projects/Sentiment Analysis/senti.py
Normal file
145
Projects/Sentiment Analysis/senti.py
Normal file
|
@ -0,0 +1,145 @@
|
|||
## analysing text, to predict whether the rest review is positive or not
|
||||
|
||||
#supervised learning..
|
||||
|
||||
# import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
# import streamlit as st
|
||||
#importing dataset
|
||||
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
|
||||
|
||||
#Cleaning the texts .. remove stopwords etc, stemming.. similar words like love or loved
|
||||
# makes all in small letters
|
||||
# sparse matrix
|
||||
#Bag of words basis
|
||||
|
||||
#First cleaning is applied to the first record only
|
||||
import re #library used in cleaning
|
||||
review = re.sub('[^a-zA-Z]',' ',dataset['Review'][0]) #.. removing letters other than a to z
|
||||
review = review.lower() # at this stage Review is availble in the form of a string
|
||||
#remove non significant words
|
||||
import nltk
|
||||
# nltk.download('stopwords')
|
||||
from nltk.corpus import stopwords
|
||||
review = review.split() # the split finction converts the string into list
|
||||
review = [word for word in review if not word in set(stopwords.words('english'))]
|
||||
|
||||
#Stemming .. replace words which are similar like love and loved to the root word like love
|
||||
from nltk.stem.porter import PorterStemmer
|
||||
ps = PorterStemmer()
|
||||
ml = []
|
||||
for word in review:
|
||||
st = ps.stem(word)
|
||||
ml.append(st)
|
||||
review = ml
|
||||
|
||||
#joining the words to make a string of cleaned words
|
||||
review = ' '.join(review) # list is converted back to string
|
||||
|
||||
|
||||
corpus = []
|
||||
corpus.append(review) # string is put back into a list (as a combined string)
|
||||
# print(len(corpus))
|
||||
##---------------------------------
|
||||
#Now the above cleaning is applied to all the remaning records from index 1 to 999
|
||||
|
||||
for i in range(1,1000):
|
||||
review1 = re.sub('[^a-zA-Z]',' ',dataset['Review'][i])
|
||||
review1 = review1.lower()
|
||||
review1 = review1.split()
|
||||
review1 = [word for word in review1 if not word in set(stopwords.words('english'))]
|
||||
ml1 = []
|
||||
for word in review1:
|
||||
st1 = ps.stem(word)
|
||||
ml1.append(st1)
|
||||
review1 = ml1
|
||||
review1 = ' '.join(review1)
|
||||
corpus.append(review1)
|
||||
##-----------------------------------
|
||||
# To create a bag of words model
|
||||
# it is same as creating a sparse matrix through the process of tokenisation
|
||||
# ie to create a separate column for each of the word
|
||||
# so finally what we get is a review, column for each word and its frequency
|
||||
# The sparse matrix would essentially contain all the required features / feature matrix
|
||||
# Once we get the above bag of words, we shud be apply the classification template
|
||||
# this process is handled through a class called as countvectorizer
|
||||
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
cv = CountVectorizer(max_features = 1500)
|
||||
X10 = cv.fit_transform(corpus)
|
||||
# print(cv.get_feature_names())
|
||||
X1 = X10.toarray()
|
||||
|
||||
y = dataset.iloc[:, 1].values
|
||||
##-----------------------------------
|
||||
# Splitting the dataset into the Training set and Test set
|
||||
from sklearn.model_selection import train_test_split
|
||||
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size = 0.20, random_state = 0)
|
||||
|
||||
# Fitting Naive Bayes to the Training set
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
classifier = GaussianNB()
|
||||
classifier.fit(X_train, y_train)
|
||||
# plt.hist(X_train)
|
||||
plt.hist(y_train, label='positive (1) and negative (0)')
|
||||
plt.legend()
|
||||
plt.show()
|
||||
|
||||
|
||||
# Predicting the Test set results
|
||||
y_pred = classifier.predict(X_test)
|
||||
|
||||
# Making the Confusion Matrix
|
||||
|
||||
from sklearn.metrics import confusion_matrix, accuracy_score
|
||||
cm = confusion_matrix(y_test, y_pred)
|
||||
print('Confusion Matrix: \n', cm)
|
||||
acc_cm =(cm[0,0]+cm[1,1])/(cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1])
|
||||
print('Accuracy of the model based on confusion matrix: ', acc_cm)
|
||||
acc_score1 = accuracy_score(y_test, y_pred)
|
||||
print('Accuracy score of the model: ', acc_score1)
|
||||
|
||||
|
||||
|
||||
#- ------------------------------------------------------------
|
||||
#classifying the sehensence entered by the user based on the above model
|
||||
|
||||
userinput = input("Enter your input : ")
|
||||
# userinput = st.text_area("Enter your input : ")
|
||||
|
||||
review2 = re.sub('[^a-zA-Z]',' ',userinput) #.. removing letters other than a to z
|
||||
review2 = review2.lower()
|
||||
|
||||
review2 = review2.split()
|
||||
review2 = [word for word in review2 if not word in set(stopwords.words('english'))]
|
||||
|
||||
ml13 = []
|
||||
for word in review2:
|
||||
st13 = ps.stem(word)
|
||||
ml13.append(st13)
|
||||
review2 = ml13
|
||||
|
||||
#joining the words to make a string of cleaned words
|
||||
review2 = ' '.join(review2)
|
||||
|
||||
import copy
|
||||
corpus_copy = copy.deepcopy(corpus)
|
||||
corpus_copy.append(review2)
|
||||
|
||||
#It is necessary to put the new record below the existing records so that the new record is also
|
||||
#arranged in the same manner..
|
||||
|
||||
cv = CountVectorizer(max_features = 1500)
|
||||
X111 = cv.fit_transform(corpus_copy).toarray()
|
||||
|
||||
y_pred1 = classifier.predict(X111[-1:][:])
|
||||
print(y_pred1)
|
||||
|
||||
if y_pred1 == 1:
|
||||
print("The customer feedback is positive")
|
||||
else:
|
||||
print("The customer feedback is negative")
|
||||
|
||||
corpus_copy = []
|
Loading…
Reference in New Issue
Block a user