forked from gnyers/python-tuesday
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprofiler
executable file
·125 lines (101 loc) · 4.39 KB
/
profiler
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
'''Recursively analyze a directory structure and provide some stats about the
disk space usage of the entire content, the number of files, directories, hard-
and symlinks.
Usage:
./profiler exampledir
Example output of a simple directory structure:
--- Statistics of directory: exampledir/
Number of errors encountered while processing 0
The number of dirs: 6
The number of non-dirs (i.e.: files, hard- and symlinks): 8
The number of symlinks: 1
The number of hardlinks: 3
Used disk space: 524 bytes
Which of the hardlinks point to the same file?
exampledir/file1.dat, exampledir/a/file2.bin, exampledir/c/e/file15 : 7 bytes
'''
import os
import sys
from os.path import join, isdir, isfile, islink
# Program details
__author__ = 'Gábor Nyers'
__version__ = '0.0.1'
__date__ = '2022-05-31'
__license__ = 'GPLv3'
# NOTE: the components stay strings, i.e. ('0', '0', '1'), not integers
version_info = tuple(__version__.split('.'))

# Variable defs
# The directory to analyze is the first CLI argument.  Exit with a short
# usage message (on stderr, exit status 1) instead of an IndexError traceback
# when it is missing.
if len(sys.argv) < 2:
    _prog = sys.argv[0] if sys.argv else 'profiler'
    sys.exit(f'Usage: {_prog} <directory>')
startdir = sys.argv[1]                # get CLI argument 'dir'
data = {}                             # data structure: path -> attribute dict
def getattrs(path):
    '''Return a customized set of attributes of `path`.

    The numeric attributes come from `os.lstat()`, which does **not** follow
    symbolic links, so for a symlink they describe the link itself.  The
    `isdir` and `isfile` flags come from `os.path.isdir()`/`isfile()`, which
    **do** follow symbolic links, so they describe the link's target.

    Returns a dict with the keys:
        isdir      -- True if `path` is (or points to) a directory
        isfile     -- True if `path` is (or points to) a regular file
        issymlink  -- True if `path` itself is a symbolic link
        refcount   -- number of names (hard links) referring to the inode
        size       -- size in bytes as reported by lstat
        inode      -- inode number; identical for all names of the same file
                      (only unique within a single filesystem)
        ishardlink -- True if `isfile` is set and `refcount` > 1; always
                      present, so every returned dict has the same keys

    Raises OSError (e.g. FileNotFoundError, PermissionError) if `path`
    cannot be stat'ed.
    '''
    attrs = os.lstat(path)
    is_file = isfile(path)             # NOTE: also True for hard- or symlinks!
    return {
        'isdir': isdir(path),          # is it a directory? True/False
        'isfile': is_file,             # is it a file?
        'issymlink': islink(path),     # is it a symlink?
        'refcount': attrs.st_nlink,    # how many filenames to same inode?
        'size': attrs.st_size,         # the nr. of bytes reported by lstat
        'inode': attrs.st_ino,         # used to find names of the same file
        # Hardlinks are files that have multiple names, in the same or a
        # different directory.
        'ishardlink': is_file and attrs.st_nlink > 1,
    }
# Initialize stuff
errors = {}        # path -> args of the OSError raised while processing it
total_size = 0     # NOTE(review): not used by the code below


def _record_walk_error(err):
    '''Remember a directory that `os.walk` failed to list.'''
    # os.walk passes the OSError; the offending path is in `err.filename`.
    errors[str(err.filename)] = err.args


# Recursively iterate through the `startdir`,
# **without** following symbolic links.  Without an `onerror` callback
# os.walk silently skips directories it cannot list, so they would be missing
# from the error count printed below.
for path, subdirs, files in os.walk(startdir, followlinks=False,
                                    onerror=_record_walk_error):
    data[path] = getattrs(path)
    data[path]['nr_of_nondirs'] = len(files)    # how many non-dir
                                                # objects in curr. dir?
    sizes = 0
    for f in files:                    # nested loop to inspect dir. objects
        # Build the name outside the `try` so the handler below always refers
        # to the file that actually failed.
        fpath = join(path, f)
        try:                           # in case something goes wrong...
            data[fpath] = getattrs(fpath)
            sizes += data[fpath]['size']
        except OSError as e:           # ... e.g. file vanished / no access
            errors[fpath] = e.args     # remember which file and what error
    data[path]['disk_usage'] = sizes   # summed size of the non-dir objects
                                       # directly inside the curr. dir

### For debug purposes: uncomment to see what the content of `data` is
# for p, attrs in data.items():
#     print(f'{p}: {attrs}')

### Print results:
print(f'--- Statistics of directory: {startdir}')
print(f'Number of errors encountered while processing {len(errors)}')
print(f' {", ".join(errors)}')

dirs = [path for path, attrs in data.items() if attrs['isdir']]
print(f'The number of dirs: {len(dirs)}')

files = [path for path, attrs in data.items() if not attrs['isdir']]
print(f'The number of non-dirs (i.e.: files, hard- and symlinks): {len(files)}')

symlinks = [path for path, attrs in data.items() if attrs.get('issymlink')]
print(f'The number of symlinks: {len(symlinks)}')

hardlinks = [path for path, attrs in data.items() if attrs.get('ishardlink')]
print(f'The number of hardlinks: {len(hardlinks)}')

# Add up the per-directory totals collected during the walk.
total_disk_usage = sum(attrs.get('disk_usage', 0)
                       for attrs in data.values()
                       if attrs['isdir'])
print(f'Used disk space: {total_disk_usage} bytes')

print('Which of the hardlinks point to the same file? ')
# Group the hardlinked names by inode: names sharing an inode are one file.
distict_files = {}
for path in hardlinks:
    distict_files.setdefault(data[path]['inode'], []).append(path)
for inode, paths in distict_files.items():
    fsize = data[paths[0]]['size']     # all names report the same size
    print(f' {", ".join(paths)} : {fsize} bytes')
# **Challenge:**
#
# The method used to calculate the `total_disk_usage` does not take hardlinks
# into account.
#
# (1) Why?
# (2) Based on the `distict_files` dictionary, you could correct the
# value of `total_disk_usage` to reflect the **exact** disk usage. How?