From 15c7b3b0094abbc56cb5cdf34e6b6f14bfba9329 Mon Sep 17 00:00:00 2001 From: anshrusia200 Date: Mon, 10 Oct 2022 21:08:34 +0530 Subject: [PATCH] Added plagiarism checker script --- scripts/Plagiarism-checker/README.md | 12 +++++++++ scripts/Plagiarism-checker/sample1.txt | 1 + scripts/Plagiarism-checker/sample2.txt | 1 + scripts/Plagiarism-checker/script.py | 36 ++++++++++++++++++++++++++ 4 files changed, 50 insertions(+) create mode 100644 scripts/Plagiarism-checker/README.md create mode 100644 scripts/Plagiarism-checker/sample1.txt create mode 100644 scripts/Plagiarism-checker/sample2.txt create mode 100644 scripts/Plagiarism-checker/script.py diff --git a/scripts/Plagiarism-checker/README.md b/scripts/Plagiarism-checker/README.md new file mode 100644 index 0000000..b0b8855 --- /dev/null +++ b/scripts/Plagiarism-checker/README.md @@ -0,0 +1,12 @@ +# Plagiarism Checker + +This is a simple Python script to check plagiarism between 2 files. + +## Using the script + +```bash +# install scikit-learn +pip install scikit-learn +# run script +python script.py +``` diff --git a/scripts/Plagiarism-checker/sample1.txt b/scripts/Plagiarism-checker/sample1.txt new file mode 100644 index 0000000..3a4d6dc --- /dev/null +++ b/scripts/Plagiarism-checker/sample1.txt @@ -0,0 +1 @@ +A plagiarism checker helps in preventing duplicacy in two files. This way authenticity of content is maintained. 
# Plagiarism detector using cosine similarity
from itertools import combinations

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def Plagiarism_Checker(files, student):
    """Score every pair of documents by TF-IDF cosine similarity.

    Args:
        files: Labels (e.g. file names) for the documents, in the same
            order as ``student``.
        student: Iterable of raw document texts, one string per label.

    Returns:
        A set of ``(label_a, label_b, score)`` tuples, one per unordered
        pair of documents, with the two labels in sorted order. ``score``
        is the cosine similarity of the two TF-IDF vectors; the percent
        match is ``score * 100``. The set is empty when fewer than two
        labels are supplied.

    Raises:
        ValueError: Propagated from scikit-learn when the texts cannot be
            vectorized (for example, an empty vocabulary).
    """
    labels = list(files)
    if len(labels) < 2:
        # Nothing to compare; skip vectorizing altogether.
        return set()

    # Fit one vocabulary over all documents, then compute the whole
    # similarity matrix in a single call instead of once per pair.
    tfidf = TfidfVectorizer().fit_transform(student)
    scores = cosine_similarity(tfidf)

    # Pair each label with its row; surplus labels without a document are
    # dropped, mirroring zip() truncation.
    labels = labels[: tfidf.shape[0]]

    results = set()
    # combinations() visits each unordered pair exactly once and works on
    # row indices, so duplicate labels cannot trigger array comparisons.
    for (row_a, label_a), (row_b, label_b) in combinations(enumerate(labels), 2):
        first, second = sorted((label_a, label_b))
        results.add((first, second, scores[row_a, row_b]))
    return results


student_files = ["sample1.txt", "sample2.txt"]


def main():
    """Read the sample files from the current directory and print scores."""
    student_notes = []
    for path in student_files:
        # Explicit encoding keeps the result independent of the locale.
        with open(path, "r", encoding="utf-8") as f:
            student_notes.append(f.read())

    for result in Plagiarism_Checker(student_files, student_notes):
        print("Result: ", result)


if __name__ == "__main__":
    main()