Added python script to remove duplicate files from a directory (#179)
* create script
* create README.md
* updated README.md
* updated README.md
parent 196815c664
commit 11501c5292
README.md
@@ -155,6 +155,7 @@ So far, the following projects have been integrated to this repo:
|[Send messages to sqs in parallel](send_sqs_messages_in_parallel)|[Jinam Shah](https://github.com/jinamshah)|
|[Codeforces Checker](codeforcesChecker)|[Jinesh Parakh](https://github.com/jineshparakh)|
|[Github repo creator](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/Git_repo_creator)|[Harish Tiwari](https://github.com/optimist2309)|
|[Remove-Duplicate-Files](Remove-Duplicate-Files)|[Aayushi Varma](https://github.com/aayuv17)|
## How to use :
Remove-Duplicate-Files/README.md (new file, 5 lines)
@@ -0,0 +1,5 @@
# Remove Duplicate Files
A Python script to find and remove duplicate files from a user-specified directory.
# Usage
Simply run removeDuplicateFiles.py from the terminal and enter the path to the target directory when prompted.
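For illustration, a session might look like this (the directory and file names are hypothetical):

```
$ python removeDuplicateFiles.py
Enter relative path to directory: photos
Duplicates in the directory are:
(photos/img_001.jpg, photos/backup/img_001.jpg, )
Enter Y to delete duplicate files: Y
```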
Remove-Duplicate-Files/removeDuplicateFiles.py (new file, 59 lines)
@@ -0,0 +1,59 @@
import os
import hashlib


# Compute the SHA-1 hash of a file, reading it in 1 KiB blocks
# so large files are never loaded into memory all at once.
def computeFileHash(fileName):
    genHash = hashlib.sha1()
    with open(fileName, 'rb') as file:
        block = file.read(1024)
        while block != b'':
            genHash.update(block)
            block = file.read(1024)
    return genHash.hexdigest()


# Recursively collect the paths of all files under a directory.
def getFileList(dirPath):
    listOfFiles = list()
    for (dirpath, dirnames, filenames) in os.walk(dirPath):
        listOfFiles += [os.path.join(dirpath, file) for file in filenames]
    return listOfFiles


def main():
    dirPath = input("Enter relative path to directory: ")
    if not os.path.exists(dirPath):
        print("Invalid path.")
        return
    listOfFiles = getFileList(dirPath)
    duplicateFileSizes = {}
    duplicateFileHashes = {}
    # Group files by size first, so that hashes only have to be
    # computed for files that share a size.
    for file in listOfFiles:
        fileSize = os.path.getsize(file)
        if fileSize in duplicateFileSizes:
            duplicateFileSizes[fileSize].append(file)
        else:
            duplicateFileSizes[fileSize] = [file]
    for sameSizeFiles in duplicateFileSizes.values():
        if len(sameSizeFiles) > 1:
            for path in sameSizeFiles:
                fileHash = computeFileHash(path)
                if fileHash in duplicateFileHashes:
                    duplicateFileHashes[fileHash].append(path)
                else:
                    duplicateFileHashes[fileHash] = [path]
    print("Duplicates in the directory are:")
    for files in duplicateFileHashes.values():
        # Only hash groups with more than one file are true duplicates.
        if len(files) > 1:
            print("(", end='')
            for fileName in files:
                print(fileName, end=', ')
            print(")")
    delete = input('Enter Y to delete duplicate files: ')
    if delete == 'Y' or delete == 'y':
        # Keep the first file of each group and delete the rest.
        for files in duplicateFileHashes.values():
            for fileName in files[1:]:
                os.remove(fileName)


if __name__ == '__main__':
    main()
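The same size-then-hash pass can be written more compactly with collections.defaultdict, which removes the manual dictionary-key checks. A minimal sketch, assuming Python 3.8+ for the `:=` operator; findDuplicates is a hypothetical name, not part of the committed script:

```python
import os
import hashlib
from collections import defaultdict


def findDuplicates(dirPath):
    # Bucket every file under dirPath by size.
    bySize = defaultdict(list)
    for dirpath, _, filenames in os.walk(dirPath):
        for name in filenames:
            path = os.path.join(dirpath, name)
            bySize[os.path.getsize(path)].append(path)

    # Hash only the buckets that contain more than one file.
    byHash = defaultdict(list)
    for paths in bySize.values():
        if len(paths) > 1:
            for path in paths:
                sha1 = hashlib.sha1()
                with open(path, 'rb') as f:
                    while block := f.read(1024):
                        sha1.update(block)
                byHash[sha1.hexdigest()].append(path)

    # Hash groups with more than one member are the duplicates.
    return [group for group in byHash.values() if len(group) > 1]
```

Matching SHA-1 hashes on same-sized files make false positives very unlikely; a byte-for-byte check such as filecmp.cmp would be the conclusive confirmation before deleting anything.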