#!/usr/bin/python
# An experimental deduplication ratio checker
# http://21stcenturystorage.cebis.net
#
#
# Disclaimer: All code is "use at your own risk".
# I provide no warranties or guarantees of anything.
#
# This program reads all files in the given directory
# and roughly estimates possible savings by computing and
# checking a hash value for every 4 kB data block
#
# The code below runs unchanged on Python 2.6+ and Python 3.

import hashlib
import os
import sys
from stat import S_ISREG

# Size in bytes of one filesystem block; every block is fingerprinted separately.
BLOCK_SIZE = 4096

# Name of the snapshot entries that must not be counted as live data.
SNAPSHOT_NAME = ".snapshot"


def blocks_needed(file_size, block_size=BLOCK_SIZE):
    """Return how many blocks of block_size bytes hold file_size bytes.

    This is a ceiling division: an empty file needs 0 blocks and a file that
    is an exact multiple of the block size needs no extra block.
    """
    return (file_size + block_size - 1) // block_size


def scan_file(filename, fingerprint_index, block_size=BLOCK_SIZE):
    """Fingerprint every block of one file.

    filename          -- path of the regular file to read
    fingerprint_index -- set of MD5 digests seen so far; new digests are
                         added to it in place
    block_size        -- number of bytes per block

    Returns a tuple (total, new): the number of blocks read from the file and
    how many of them had a fingerprint that was not in the index yet.
    """
    total = 0
    new = 0
    # Binary mode: the data must be hashed exactly as stored on disk.
    with open(filename, "rb") as handle:
        while True:
            blockdata = handle.read(block_size)
            if not blockdata:
                # End of file; never fingerprint an empty read.
                break
            total += 1
            fingerprint = hashlib.md5(blockdata).digest()
            if fingerprint not in fingerprint_index:
                fingerprint_index.add(fingerprint)
                new += 1
    return total, new


def estimate(workdir, block_size=BLOCK_SIZE):
    """Walk workdir and count total versus unique blocks.

    Snapshot entries, unreadable files, symlinks and non-regular files are
    skipped. One progress line is printed for every file that is looked at.

    Returns a tuple (numfiles, storage_blocks, total_blocks):
    numfiles       -- number of files that were read
    storage_blocks -- blocks needed after de-duplication
    total_blocks   -- blocks read before de-duplication
    """
    numfiles = 0
    storage_blocks = 0
    total_blocks = 0
    fingerprint_index = set()

    for root, dirs, files in os.walk(workdir):
        # Pruning dirs in place stops os.walk from descending into snapshots.
        dirs[:] = [d for d in dirs if d != SNAPSHOT_NAME]

        for name in files:
            # skip snapshots
            if name == SNAPSHOT_NAME:
                continue

            filename = os.path.join(root, name)
            # No newline here: the outcome is appended to the same line.
            sys.stdout.write("Current file: " + filename + "  ")

            # skip unaccessible files
            if not os.access(filename, os.R_OK):
                print("No access, skip")
                continue

            # Skip symlinks
            if os.path.islink(filename):
                print("Symlink, skip")
                continue

            # Skip non-regular files
            info = os.stat(filename)
            if not S_ISREG(info.st_mode):
                print("No regular file, skip")
                continue

            file_size = info.st_size
            print("File size is " + str(file_size) + " bytes ("
                  + str(blocks_needed(file_size, block_size)) + " blocks)")

            # read file and compute a MD5 fingerprint for every block
            file_total, file_new = scan_file(filename, fingerprint_index,
                                             block_size)
            total_blocks += file_total
            storage_blocks += file_new
            numfiles += 1

    return numfiles, storage_blocks, total_blocks


def format_stats(numfiles, storage_blocks, total_blocks):
    """Return the one-line summary for the given counters.

    The ratio is the percentage of blocks that still have to be stored; it is
    reported as "n/a" when no block was read, instead of dividing by zero.
    """
    if total_blocks:
        ratio = str((100.0 * storage_blocks) / total_blocks) + "%"
    else:
        ratio = "n/a"
    return ("Stats: # files: " + str(numfiles)
            + ", Used: " + str(storage_blocks)
            + " blocks, total " + str(total_blocks)
            + " blocks, Ratio " + ratio)


def main(argv=None):
    """Command-line entry point.

    argv -- argument list including the program name; defaults to sys.argv.

    Exits with status 1 when the argument count is wrong or the argument is
    not a directory.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 2:
        print("Usage: dedupe-estimate.py DIRECTORY")
        sys.exit(1)

    workdir = argv[1]
    if not os.path.isdir(workdir):
        print("Problem: " + workdir + " is not a directory")
        sys.exit(1)

    print("Checking directory: " + workdir)
    numfiles, storage_blocks, total_blocks = estimate(workdir)
    print(format_stats(numfiles, storage_blocks, total_blocks))


if __name__ == "__main__":
    main()