Merge pull request #213 from neocollege/neo-user

add sentiment analysis project
This commit is contained in:
Bartick Maiti 2022-10-09 02:19:21 +05:30 committed by GitHub
commit e85beacbc3
2 changed files with 1146 additions and 0 deletions

@@ -0,0 +1,145 @@
## Analysing restaurant review text to predict whether the review is positive or not
# Supervised learning (binary classification)
# import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# import streamlit as st
# Importing the dataset (tab-separated reviews; quoting = 3, i.e. csv.QUOTE_NONE, ignores double quotes inside the reviews)
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
# Cleaning the texts: remove stopwords, convert everything to lower case and apply
# stemming so that similar words like 'love' and 'loved' map to the same root.
# The cleaned corpus is later turned into a sparse bag-of-words matrix.
# First, the cleaning is applied to the first record only.
import re # regular expressions, used for cleaning
review = re.sub('[^a-zA-Z]',' ',dataset['Review'][0]) # keep only the letters a-z / A-Z
review = review.lower() # at this stage the review is available as a single string
# Remove non-significant (stop) words
import nltk
# nltk.download('stopwords')  # run once if the stopword list is not yet installed
from nltk.corpus import stopwords
review = review.split() # split() converts the string into a list of words
review = [word for word in review if not word in set(stopwords.words('english'))]
# Stemming: reduce similar words like 'love' and 'loved' to the root word 'love'
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
review = [ps.stem(word) for word in review]
#joining the words to make a string of cleaned words
review = ' '.join(review) # list is converted back to string
corpus = []
corpus.append(review) # string is put back into a list (as a combined string)
# print(len(corpus))
##---------------------------------
# Now the same cleaning is applied to all the remaining records, from index 1 to 999
for i in range(1,1000):
    review1 = re.sub('[^a-zA-Z]',' ',dataset['Review'][i])
    review1 = review1.lower()
    review1 = review1.split()
    review1 = [word for word in review1 if not word in set(stopwords.words('english'))]
    review1 = [ps.stem(word) for word in review1]
    review1 = ' '.join(review1)
    corpus.append(review1)
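# Optional refactor (a sketch, not part of the original script): the same cleaning
# steps are repeated for the first record, the loop above and the user input further
# below, so they could be wrapped in a single helper. The name clean_review is an
# assumption used only for illustration.
def clean_review(text):
    # keep letters only, lower-case, drop stopwords and stem each remaining word
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    words = [ps.stem(w) for w in text.split() if w not in set(stopwords.words('english'))]
    return ' '.join(words)
# e.g. corpus = [clean_review(r) for r in dataset['Review']]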
##-----------------------------------
# To create a bag-of-words model
# This is the same as creating a sparse matrix through the process of tokenisation,
# i.e. creating a separate column for each word.
# So finally we get, for each review, a column for every word and its frequency.
# The sparse matrix essentially contains all the required features (the feature matrix).
# Once we have this bag of words, we can apply the classification template.
# This process is handled through a class called CountVectorizer.
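# Tiny illustration (hypothetical mini-corpus, not taken from the dataset): for the
# two cleaned reviews ['food good', 'food bad food'], CountVectorizer builds an
# alphabetical vocabulary ('bad', 'food', 'good') and counts occurrences per review:
#     [[0, 1, 1],
#      [1, 2, 0]]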
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X10 = cv.fit_transform(corpus)
# print(cv.get_feature_names())
X1 = X10.toarray()
y = dataset.iloc[:, 1].values  # labels: 1 = positive review, 0 = negative review
##-----------------------------------
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size = 0.20, random_state = 0)
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# plt.hist(X_train)
plt.hist(y_train, label='positive (1) and negative (0)')
plt.legend()
plt.show()
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n', cm)
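# Confusion-matrix layout (scikit-learn convention): rows are true labels, columns are
# predictions, so cm[0,0] = true negatives, cm[0,1] = false positives,
# cm[1,0] = false negatives and cm[1,1] = true positives.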
acc_cm = (cm[0,0]+cm[1,1])/(cm[0,0]+cm[0,1]+cm[1,0]+cm[1,1])
print('Accuracy of the model based on confusion matrix: ', acc_cm)
acc_score1 = accuracy_score(y_test, y_pred)
print('Accuracy score of the model: ', acc_score1)
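# Sanity check (equivalent computation): accuracy is just the fraction of matching
# predictions, so (y_test == y_pred).mean() gives the same value as accuracy_score.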
#--------------------------------------------------------------
# Classifying the sentence entered by the user, based on the model trained above
userinput = input("Enter your input : ")
# userinput = st.text_area("Enter your input : ")
review2 = re.sub('[^a-zA-Z]',' ',userinput) # keep only the letters a-z / A-Z
review2 = review2.lower()
review2 = review2.split()
review2 = [word for word in review2 if not word in set(stopwords.words('english'))]
review2 = [ps.stem(word) for word in review2]
#joining the words to make a string of cleaned words
review2 = ' '.join(review2)
# Vectorise the new review with the SAME CountVectorizer that was fitted on the
# training corpus, so that its columns line up with the features the classifier was
# trained on. (Re-fitting a fresh vectoriser on corpus plus the new review could
# change which 1500 features are selected and break that alignment.)
X111 = cv.transform([review2]).toarray()
y_pred1 = classifier.predict(X111)
print(y_pred1)
if y_pred1[0] == 1:
    print("The customer feedback is positive")
else:
    print("The customer feedback is negative")