find and remove duplicate files

Purushothaman · November 29, 2016, 8:24am

Use this script to find all duplicate files in a specific folder (path) on your computer.

Note: The script identifies duplicates by the MD5 hash of each file's content, not by file name, so files with different names but identical content are detected.
You can change the path in the script to search for duplicates in a different folder.

## Find and remove duplicate files by hash (i.e. by file content).
import os
import hashlib
def hashfile(path, blocksize = 65536):
    """Return the MD5 hex digest of the file at *path*.

    The file is read in chunks of *blocksize* bytes so that large files
    do not have to be loaded into memory at once.

    Returns None (after printing the error) when the file cannot be
    opened or read, so callers must treat None as "hash unavailable"
    rather than as a real digest.
    """
    hasher = hashlib.md5()
    try:
        # The context manager closes the file even if a read fails midway.
        with open(path, 'rb') as afile:
            # iter() with a sentinel stops at the empty bytes string (EOF).
            for buf in iter(lambda: afile.read(blocksize), b''):
                hasher.update(buf)
    except (IOError, OSError) as e:
        print(e)
        return None
    return hasher.hexdigest()
def findDup(parentFolder):
    """Walk *parentFolder* recursively and group file paths by content hash.

    Returns a dict in the format {hash: [paths]}; any entry whose list
    holds more than one path is a set of files with identical content.
    Files whose hash could not be computed (hashfile returned None) are
    left out, so that unreadable files are never grouped together and
    mistaken for duplicates of each other.
    """
    # Dups in format {hash:[names]}
    dups = {}
    for dirName, subdirs, fileList in os.walk(parentFolder):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            # Calculate hash
            file_hash = hashfile(path)
            if file_hash is None:
                # Hashing failed; the content is unknown, so skip the file.
                continue
            # Add or append the file path
            dups.setdefault(file_hash, []).append(path)
    return dups
def printResults(dict1):
    """Print every group of paths in *dict1* that has more than one entry.

    *dict1* maps a key to a list of paths; each list longer than one is
    printed as a group, with groups separated by a '__' line. If no list
    has more than one entry, a "no duplicates" message is printed instead.
    """
    duplicate_groups = [paths for paths in dict1.values() if len(paths) > 1]
    if not duplicate_groups:
        print('No duplicate files found.')
        return
    print('Duplicates Found:')
    print('The following files are same. The name could differ, but the content is same')
    print('__')
    for group in duplicate_groups:
        for path in group:
            print('%s' % path)
        print('__')
def removeDuplicates(dict1):
    """Delete all but the last file of every duplicate group in *dict1*.

    *dict1* is in the format {hash: [paths]}. For each list with more
    than one path, every path except the last is removed from disk and
    its name is printed. Nothing is printed or removed when there are no
    duplicate groups.

    A group stored under the key None is ignored: None means the hash
    could not be computed, so those files are not known to be identical
    and must not be deleted. A file that cannot be removed is reported
    and skipped instead of aborting the rest of the run.
    """
    results = [paths for file_hash, paths in dict1.items()
               if file_hash is not None and len(paths) > 1]
    if not results:
        return
    print('\n')
    print('Removed Files:')
    print('__')
    for result in results:
        # Keep the last path of each group as the surviving copy.
        for ele in result[:-1]:
            try:
                os.remove(ele)
            except OSError as e:
                print('Could not remove %s: %s' % (ele, e))
            else:
                print(ele)
        print('__')
if __name__ == '__main__':
    # Scan the folder, report the duplicate groups, then delete the extras.
    # The result is not named `dict`, which would shadow the builtin type.
    duplicates = findDup('C:\\Users\\Administrator\\Downloads') ## you change your path here...
    printResults(duplicates)
    removeDuplicates(duplicates)

Sample Output: