From 11501c52928f82773e142bdf4b4bd883d585c8c9 Mon Sep 17 00:00:00 2001 From: Aayushi Varma <59158445+aayuv17@users.noreply.github.com> Date: Wed, 7 Oct 2020 16:14:18 +0530 Subject: [PATCH] Added python script to remove duplicate files from a directory (#179) * create script * create README.md * updated README.md * updated README.md --- README.md | 1 + Remove-Duplicate-Files/README.md | 5 ++ .../removeDuplicateFiles.py | 59 +++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 Remove-Duplicate-Files/README.md create mode 100644 Remove-Duplicate-Files/removeDuplicateFiles.py diff --git a/README.md b/README.md index bc75b13..b859353 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ So far, the following projects have been integrated to this repo: |[Send messages to sqs in parallel](send_sqs_messages_in_parallel)|[Jinam Shah](https://github.com/jinamshah)| |[Codeforces Checker](codeforcesChecker)|[Jinesh Parakh](https://github.com/jineshparakh)| |[Github repo creator](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/Git_repo_creator)|[Harish Tiwari ](https://github.com/optimist2309) +|[Remove-Duplicate-Files](Remove-Duplicate-Files)|[Aayushi Varma](https://github.com/aayuv17) ## How to use : diff --git a/Remove-Duplicate-Files/README.md b/Remove-Duplicate-Files/README.md new file mode 100644 index 0000000..4e91a2e --- /dev/null +++ b/Remove-Duplicate-Files/README.md @@ -0,0 +1,5 @@ +# Remove Duplicate Files +A python script to find/remove duplicate files from the user specified directory + +# Usage +Simply run the script removeDuplicateFiles.py from the terminal after specifying the path diff --git a/Remove-Duplicate-Files/removeDuplicateFiles.py b/Remove-Duplicate-Files/removeDuplicateFiles.py new file mode 100644 index 0000000..94d14e9 --- /dev/null +++ b/Remove-Duplicate-Files/removeDuplicateFiles.py @@ -0,0 +1,59 @@ +import os +import hashlib + +# function to compute SHA-1 hash of a file +def computeFileHash(fileName): + 
def computeFileHash(fileName):
    """Return the SHA-1 hex digest of the file at *fileName*.

    The file is read in 64 KiB chunks so arbitrarily large files can be
    hashed without loading them entirely into memory.
    """
    genHash = hashlib.sha1()
    with open(fileName, 'rb') as f:
        # iter(callable, sentinel) calls f.read until it returns b'' (EOF).
        # No explicit close() needed: the with-block handles it.
        for chunk in iter(lambda: f.read(65536), b''):
            genHash.update(chunk)
    return genHash.hexdigest()


def getFileList(dirPath):
    """Recursively collect the paths of all files under *dirPath*."""
    listOfFiles = []
    for dirpath, _dirnames, filenames in os.walk(dirPath):
        listOfFiles += [os.path.join(dirpath, name) for name in filenames]
    return listOfFiles


def main():
    """Interactively report duplicate files in a directory and optionally delete them."""
    dirPath = input("Enter relative path to directory: ")
    if not os.path.exists(dirPath):
        print("Invalid path.")
        return  # plain return instead of exit(); works even when site isn't loaded
    listOfFiles = getFileList(dirPath)
    # Group files by size first: only files of equal size can be identical,
    # so expensive hashing is limited to same-size groups.
    filesBySize = {}
    for path in listOfFiles:
        filesBySize.setdefault(os.path.getsize(path), []).append(path)
    filesByHash = {}
    for sameSize in filesBySize.values():
        if len(sameSize) > 1:
            for path in sameSize:
                filesByHash.setdefault(computeFileHash(path), []).append(path)
    # BUG FIX: keep only hash groups with two or more members. Previously,
    # same-size files with different content produced one-element groups
    # that were wrongly reported as duplicates.
    duplicateGroups = [group for group in filesByHash.values() if len(group) > 1]
    print("Duplicates in the directory are:")
    for group in duplicateGroups:
        print("(" + ", ".join(group) + ")")
    delete = input('Enter Y to delete duplicate files: ')
    if delete == 'Y' or delete == 'y':
        for group in duplicateGroups:
            # Keep the first file of each duplicate group; remove the rest.
            for fileName in group[1:]:
                os.remove(fileName)


if __name__ == '__main__':
    main()