Added Python script to remove duplicate files from a directory (#179)

* create script

* create README.md

* updated README.md

* updated README.md
Aayushi Varma 2020-10-07 16:14:18 +05:30 committed by GitHub
parent 196815c664
commit 11501c5292
3 changed files with 65 additions and 0 deletions

README.md

@ -155,6 +155,7 @@ So far, the following projects have been integrated to this repo:
|[Send messages to sqs in parallel](send_sqs_messages_in_parallel)|[Jinam Shah](https://github.com/jinamshah)|
|[Codeforces Checker](codeforcesChecker)|[Jinesh Parakh](https://github.com/jineshparakh)|
|[Github repo creator](https://github.com/hastagAB/Awesome-Python-Scripts/tree/master/Git_repo_creator)|[Harish Tiwari](https://github.com/optimist2309)|
|[Remove-Duplicate-Files](Remove-Duplicate-Files)|[Aayushi Varma](https://github.com/aayuv17)|
## How to use:

Remove-Duplicate-Files/README.md

@ -0,0 +1,5 @@
# Remove Duplicate Files
A Python script to find and remove duplicate files from a user-specified directory.
# Usage
Run removeDuplicateFiles.py from the terminal; the script prompts for the relative path of the directory to scan and asks for confirmation before deleting anything.
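A session might look like this (the directory name and file names below are hypothetical):

```
$ python removeDuplicateFiles.py
Enter relative path to directory: ./photos
Duplicates in the directory are:
( ./photos/a.jpg, ./photos/copy_of_a.jpg )
Enter Y to delete duplicate files: Y
```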

Remove-Duplicate-Files/removeDuplicateFiles.py

@ -0,0 +1,59 @@
import os
import sys
import hashlib
# compute the SHA-1 hash of a file, reading it in 1024-byte blocks
# so that large files do not have to be loaded into memory at once
def computeFileHash(fileName):
    genHash = hashlib.sha1()
    with open(fileName, 'rb') as file:
        block = file.read(1024)
        while block != b'':
            genHash.update(block)
            block = file.read(1024)
    return genHash.hexdigest()
# collect the paths of all files under a directory, walking it recursively
def getFileList(dirPath):
    listOfFiles = list()
    for (dirpath, dirnames, filenames) in os.walk(dirPath):
        listOfFiles += [os.path.join(dirpath, file) for file in filenames]
    return listOfFiles
def main():
    dirPath = input("Enter relative path to directory: ")
    if not os.path.exists(dirPath):
        print("Invalid path.")
        sys.exit(1)
    listOfFiles = getFileList(dirPath)
    duplicateFileSizes = {}
    duplicateFileHashes = {}
""" grouping files according to their size, so that hashes have to be
computed only for files having the same size"""
for file in listOfFiles:
fileSize = os.path.getsize(file)
if fileSize in duplicateFileSizes:
duplicateFileSizes[fileSize].append(file)
else:
duplicateFileSizes[fileSize] = [file]
    # within each same-size group, group files by their SHA-1 hash
    for fileList in duplicateFileSizes.values():
        if len(fileList) > 1:
            for path in fileList:
                fileHash = computeFileHash(path)
                if fileHash in duplicateFileHashes:
                    duplicateFileHashes[fileHash].append(path)
                else:
                    duplicateFileHashes[fileHash] = [path]
print("Duplicates in the directory are:")
for files in duplicateFileHashes.values():
print("(", end='')
for fileName in files:
print(fileName, end=', ')
print(")")
    delete = input('Enter Y to delete duplicate files: ')
    if delete in ('Y', 'y'):
        for files in duplicateFileHashes.values():
            # keep the first copy in each group and delete the rest
            for fileName in files[1:]:
                os.remove(fileName)
if __name__ == '__main__':
    main()