# mirror of https://github.com/metafy-social/python-scripts.git
# synced 2024-11-30 15:31:10 +00:00
# 146 lines, 4.7 KiB, Python
## Restaurant-review sentiment analysis: supervised learning that predicts
## whether a review is positive (1) or negative (0).

# import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# import streamlit as st

# Load the reviews. The file is tab-separated; quoting=3 (csv.QUOTE_NONE)
# keeps double quotes inside review text from confusing the parser.
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
#Cleaning the texts: keep letters only, lowercase, drop stopwords, and stem
#so similar words ("love"/"loved") collapse to one root. Each cleaned review
#is rebuilt as a single string and collected in `corpus` for the
#bag-of-words (sparse matrix) step below.
#
#Previously record 0 was cleaned inline and records 1-999 by a
#near-duplicate loop; the pipeline is now written once and applied to every
#record.

import re  # regex library used for the character filtering

import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# The stopword set never changes, so build it once instead of once per review.
_stop_words = set(stopwords.words('english'))


def _clean_review(text):
    """Return `text` cleaned for bag-of-words encoding.

    Non-letters are replaced by spaces, the result is lowercased and split
    into words, English stopwords are dropped, and each remaining word is
    Porter-stemmed. The cleaned words are joined back into one string.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in _stop_words)


# Apply the identical cleaning to every record in the dataset.
corpus = [_clean_review(text) for text in dataset['Review']]
# print(len(corpus))
##-----------------------------------
# Bag-of-words model: tokenise the corpus into a sparse document-term
# matrix — one column per word, each cell the word's frequency in that
# review — keeping only the 1500 most frequent words. That matrix is the
# feature matrix for classification, built by CountVectorizer.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 1500)
X10 = cv.fit_transform(corpus)   # sparse document-term matrix
# print(cv.get_feature_names())
X1 = X10.toarray()               # densify: GaussianNB needs a dense array

# Labels: column 1 of the dataset (1 = positive review, 0 = negative).
y = dataset.iloc[:, 1].values

##-----------------------------------
# Hold out 20% of the records as a test set; fixed random_state keeps the
# split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X1, y, test_size = 0.20, random_state = 0)

# Fit a Gaussian Naive Bayes classifier on the training split.
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Visualise the class balance of the training labels.
# plt.hist(X_train)
plt.hist(y_train, label='positive (1) and negative (0)')
plt.legend()
plt.show()
# Evaluate the trained classifier on the held-out test set.
y_pred = classifier.predict(X_test)

# Confusion matrix: rows = actual class, columns = predicted class.
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n', cm)

# Accuracy recomputed by hand from the matrix: correct predictions
# (the diagonal) over all predictions.
acc_cm = (cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])
print('Accuracy of the model based on confusion matrix: ', acc_cm)

# Cross-check against sklearn's own accuracy metric.
acc_score1 = accuracy_score(y_test, y_pred)
print('Accuracy score of the model: ', acc_score1)
#- ------------------------------------------------------------
# Classify a sentence entered by the user with the model trained above.
#
# BUG FIX: the original created and re-FITTED a brand-new CountVectorizer on
# corpus + the new review. Re-fitting can produce a different vocabulary and
# column order than the vectorizer the classifier was trained with, so the
# feature columns would no longer line up with the trained model and the
# prediction would be meaningless. The already-fitted `cv` must be reused
# with transform() instead; this also removes the need to deep-copy the
# corpus.

userinput = input("Enter your input : ")
# userinput = st.text_area("Enter your input : ")

# Apply exactly the same cleaning pipeline used for the training corpus:
# letters only, lowercase, stopword removal, Porter stemming, re-join.
review2 = re.sub('[^a-zA-Z]', ' ', userinput)
review2 = review2.lower()
review2 = review2.split()
review2 = [word for word in review2 if not word in set(stopwords.words('english'))]
review2 = ' '.join(ps.stem(word) for word in review2)

# Encode the new review in the SAME 1500-column feature space the
# classifier was trained on, using the fitted vectorizer.
X111 = cv.transform([review2]).toarray()

y_pred1 = classifier.predict(X111)
print(y_pred1)

# predict() returns an array; index element 0 rather than truth-testing
# the array itself.
if y_pred1[0] == 1:

    print("The customer feedback is positive")

else:

    print("The customer feedback is negative")