bdisk/bdisk/mtree.py

397 lines
17 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
import argparse
import copy
import datetime
2018-09-23 07:55:58 -04:00
import grp
import hashlib
import os
import pathlib
2018-09-23 07:55:58 -04:00
import platform
import pwd
import re
2018-09-23 07:55:58 -04:00
import stat
from collections import OrderedDict
try:
import pycksum
has_cksum = True
except ImportError:
has_cksum = False
# Parse BSD mtree spec files.
# On Arch, BSD mtree is ported in the AUR as nmtree.
# TODO: add a generator class as well? (in progress)
# TODO: add a checking function as well?
# The format used for headers
_header_strptime_fmt = '%a %b %d %H:%M:%S %Y'
# Supported hash types (for generation). These are globally available always.
_hashtypes = ['md5', 'sha1', 'sha256', 'sha384', 'sha512']
# If RIPEMD-160 is supported, we add it (after MD5). Being listed in
# algorithms_available is not enough on its own: some OpenSSL builds list the
# name but refuse to construct the hasher, so we probe it before advertising it.
if 'ripemd160' in hashlib.algorithms_available:
    try:
        hashlib.new('ripemd160')
    except ValueError:
        pass
    else:
        _hashtypes.insert(1, 'rmd160')
# Iterative to determine which type an item is.
_stype_map = {'block': stat.S_ISBLK,
              'char': stat.S_ISCHR,
              'dir': stat.S_ISDIR,
              'fifo': stat.S_ISFIFO,
              'file': stat.S_ISREG,
              'link': stat.S_ISLNK,
              'socket': stat.S_ISSOCK}
# Regex pattern for cleaning up an octal perm mode into a string representation.
# (No longer used by MTreeGen itself; kept so existing importers do not break.)
_octre = re.compile(r'^0o')


class MTreeGen(object):
    """Generate an mtree spec for a directory tree (work in progress).

    Only the header is fully generated at the moment; it is available as
    ``self.spec`` right after construction. The per-path helpers
    (``_get_stats``, ``_gen_hashes``, ``_gen_cksum``) are usable on their own.
    """

    def __init__(self, path):
        """Prepare a generator rooted at *path*.

        *path* has ``~`` expanded and is made absolute before being stored as
        a ``pathlib.PosixPath`` in ``self.path``.
        """
        self.path = pathlib.PosixPath(os.path.abspath(os.path.expanduser(path)))
        # These are used to keep a cached copy of the info.
        self._sysinfo = {'uids': {}, 'gids': {}}
        self._build_header()
        # We use this to keep track of where we are exactly in the tree so we can generate a full absolute path at
        # any moment relative to the tree.
        self._path_pointer = copy.deepcopy(self.path)

    def paths_iterator(self):
        """Walk the tree below ``self.path`` (unfinished).

        Each file and directory is stat'ed and, for files, a partial spec
        line is assembled, but nothing is appended to ``self.spec`` yet.
        Entries that cannot be stat'ed are reported on stdout and skipped.
        Returns an empty tuple.
        """
        for root, dirs, files in os.walk(self.path):
            # Names yielded by os.walk are relative to *root*, not to the top
            # of the tree, so they must be joined against root.
            _root = pathlib.PosixPath(root)
            for f in files:
                _fname = _root.joinpath(f)
                _stats = self._get_stats(_fname)
                if not _stats:
                    print(('WARNING: {0} either disappeared while we were trying to parse it or '
                           'it is a broken symlink.').format(_fname))
                    continue
                # TODO: get /set line here?
                item = ' {0} \\\n'.format(f)
                _type = 'file'  # TODO: stat this more accurately
                _cksum = self._gen_cksum(_fname)
                item += ' {0} {1} {2}\\\n'.format(_stats['size'],
                                                  _stats['time'],
                                                  ('{0} '.format(_cksum) if _cksum else ''))
                # TODO: here's where the hashes would get added
            # TODO: here's where we parse dirs. maybe do that before files?
            # remember: mtree specs use ..'s to traverse upwards when done with a dir
            for d in dirs:
                _dname = _root.joinpath(d)
                _stats = self._get_stats(_dname)
                if not _stats:
                    print(('WARNING: {0} either disappeared while we were trying to parse it or '
                           'it is a broken symlink.').format(_dname))
                    continue
                # TODO: get /set line here?
        return ()

    def _gen_cksum(self, fpath):
        """Return the cksum of *fpath* via pycksum, or None.

        None is returned when pycksum could not be imported or when *fpath*
        is not a regular file.
        """
        if not has_cksum:
            return None
        if not os.path.isfile(fpath):
            return None
        # TODO: waiting on https://github.com/sobotklp/pycksum/issues/2 for byte iteration (because large files maybe?)
        c = pycksum.Cksum()
        with open(fpath, 'rb') as f:
            # NOTE(review): assumes Cksum.add() accepts an open binary file object - confirm against pycksum.
            c.add(f)
        return c.get_cksum()

    def _cached_name(self, kind, ident):
        """Resolve a uid ('uids') or gid ('gids') to a name, caching the answer.

        Ids that have no entry in the passwd/group database resolve to None
        instead of raising, so trees owned by unknown ids can still be read.
        """
        cache = self._sysinfo[kind]
        if ident not in cache:
            try:
                if kind == 'uids':
                    cache[ident] = pwd.getpwuid(ident).pw_name
                else:
                    cache[ident] = grp.getgrgid(ident).gr_name
            except KeyError:
                cache[ident] = None
        return cache[ident]

    def _get_stats(self, path):
        """Collect the mtree-relevant attributes of *path* without following symlinks.

        Returns a dict with the keys uid, gid, uname, gname, type (when one of
        the known file types matches), link (symlinks only), mode (zero-padded
        octal string), size, time (mtime as a string), nlink and flags.
        Returns None if *path* does not exist.
        """
        try:
            _st = os.stat(path, follow_symlinks=False)
        except FileNotFoundError:
            # Broken symlink? Shouldn't occur since follow_symlinks is False anyways, BUT...
            return None
        stats = {}
        # Ownership
        stats['uid'] = _st.st_uid
        stats['gid'] = _st.st_gid
        stats['uname'] = self._cached_name('uids', _st.st_uid)
        stats['gname'] = self._cached_name('gids', _st.st_gid)
        # Type and Mode
        for t, _is_type in _stype_map.items():
            if not _is_type(_st.st_mode):
                continue
            stats['type'] = t
            # TODO: need a reliable way of parsing this.
            # for instance, for /dev/autofs, _st.st_dev = 6 (os.makedev(6) confirms major is 0, minor is 6)
            # but netBSD mtree (ported) says it's "0xaeb" (2795? or, as str, "®b" apparently).
            # I'm guessing the kernel determines this, but where is it pulling it from/how?
            # We can probably do 'format,major,minor' (or, for above, 'linux,0,6').
            # if t in ('block', 'char'):
            #     stats['device'] = None
            # Handle symlinks. realpath() already resolves the whole chain, so
            # a single call suffices; re-checking islink() in a loop would
            # never terminate on a symlink cycle.
            if t == 'link':
                stats['link'] = os.path.realpath(path)
            break
        stats['mode'] = '{0:04o}'.format(stat.S_IMODE(_st.st_mode))
        stats['size'] = _st.st_size
        stats['time'] = str(float(_st.st_mtime))
        stats['nlink'] = _st.st_nlink
        # TODO: "flags" keyword? is that meaningful on linux?
        stats['flags'] = 'none'
        return stats

    def _gen_hashes(self, fpath):
        """Return an OrderedDict mapping each name in ``_hashtypes`` to the hex digest of *fpath*.

        The result is empty when *fpath* is not a regular file. The file is
        read once and every hasher is fed from the same chunks.
        """
        hashes = OrderedDict()
        if not os.path.isfile(fpath):
            return hashes
        # Stupid naming inconsistencies: mtree says "rmd160", hashlib says "ripemd160".
        _hashers = OrderedDict((h, hashlib.new('ripemd160' if h == 'rmd160' else h))
                               for h in _hashtypes)
        with open(fpath, 'rb') as f:
            # Hash 64kb at a time in case it's a huge file. TODO: is this the most ideal chunk size?
            for _hashbuf in iter(lambda: f.read(64000), b''):
                for _hasher in _hashers.values():
                    _hasher.update(_hashbuf)
        for h, _hasher in _hashers.items():
            hashes[h] = _hasher.hexdigest()
        return hashes

    def _build_header(self):
        """(Re)initialise ``self.spec`` with the user/machine/tree/date comment header.

        Returns an empty tuple.
        """
        _header = OrderedDict()
        try:
            _header['user'] = pwd.getpwuid(os.geteuid()).pw_name
        except KeyError:
            # No passwd entry for the effective uid; "(null)" is the
            # placeholder MTreeParse maps back to None when reading a header.
            _header['user'] = '(null)'
        _header['machine'] = platform.node()
        _header['tree'] = str(self.path)
        _header['date'] = datetime.datetime.now(datetime.timezone.utc).strftime(_header_strptime_fmt)
        _lines = ['#\t{0:>7}: {1}\n'.format(h, v) for h, v in _header.items()]
        self.spec = ''.join(_lines) + '\n'
        return ()
class MTreeParse(object):
    """Parse a BSD mtree spec into Python data structures.

    After construction:

    * ``orig_spec`` is the spec text exactly as given (decoded if bytes),
    * ``header`` holds the user/machine/tree/date header values found at the
      very top of the spec (``(null)`` values become None),
    * ``spec`` is ``{'header': header, 'paths': {PosixPath: keywords}}`` where
      each keywords dict is the active ``/set`` defaults overlaid with the
      keywords on that entry's own line. Paths are rooted at ``/``.
    """

    # A backslash followed by whitespace (newline included) is a line continuation.
    _CONT_RE = re.compile(r'\\\s+')
    _HEAD_RE = re.compile(r'^#\s+(user|machine|tree|date):\s')
    _CMT_RE = re.compile(r'^\s*#\s*')
    # Per the man page:
    # "Empty lines and lines whose first non-whitespace character is a hash mark (#) are ignored."
    _IGNORE_RE = re.compile(r'^(\s*(#.*)?)?$')
    # Used to quickly check for (and canonicalise) a synonymized hash name.
    _HASH_RE = re.compile(r'^(md5|rmd160|sha1|sha256|sha384|sha512)(digest)?$')
    # A line holding only ".." ascends one directory level.
    _PARENT_RE = re.compile(r'^\.\./?$')
    _TYPES = ('block', 'char', 'dir', 'fifo', 'file', 'link', 'socket')
    # Keywords whose values are plain decimal integers.
    _INT_KEYS = ('uid', 'gid', 'cksum', 'nlink', 'size')

    def __init__(self, spec):
        """Parse *spec*, given as a str or as UTF-8 encoded bytes.

        Raises ValueError if *spec* is neither str nor bytes, or is bytes
        that are not valid UTF-8.
        """
        if not isinstance(spec, (str, bytes)):
            raise ValueError('spec must be a raw string of the spec or a bytes object of the string')
        if isinstance(spec, bytes):
            try:
                spec = spec.decode('utf-8')
            except UnicodeDecodeError as err:
                raise ValueError('spec must be a utf-8 encoded set of bytes if using byte mode') from err
        self.orig_spec = spec  # For referencing in case someone wanted to write it out.
        # We NOW need to handle the escaped linebreaking it does.
        self._specdata = self._CONT_RE.sub('', spec).splitlines()
        self._get_header()
        self.spec = {'header': self.header,
                     'paths': {}}
        # Template for an item.
        # Default keywords are:
        # flags, gid, link, mode, nlink, size, time, type, uid
        self._tplitem = {
            'type': None,  # ('block', 'char', 'dir', 'fifo', 'file', 'link', 'socket')
            # checksum of file (if it's a file) (int)
            # On all *nix platforms, the cksum(1) utility (which is what the mtree spec uses) follows
            # the POSIX standard CRC (which is NOT CRC-1/CRC-16 nor CRC32!):
            # http://pubs.opengroup.org/onlinepubs/009695299/utilities/cksum.html
            # For a python implementation,
            # https://stackoverflow.com/questions/6835381/python-equivalent-of-unix-cksum-function
            # See also crcmod (in PyPi).
            'cksum': None,
            # "The device number to use for block or char file types." Should be converted to a tuple of one
            # of the following:
            # - (format(str), major(int), minor(int))
            # - (format(str), major(int), unit(str?), subunit(str?)) (only used on bsdos formats)
            # - (number(int?), ) ("opaque" number)
            # Valid formats are, per man page of mtree:
            # native, 386bsd, 4bsd, bsdos, freebsd, hpux, isc, linux, netbsd, osf1, sco, solaris, sunos,
            # svr3, svr4, ultrix
            'device': None,
            # File flags as symbolic name. BSD-specific thing? TODO: testing on BSD system
            'flags': [],
            'ignore': False,  # An mtree-internal flag to ignore hierarchy under this item
            'gid': None,  # The group ID (int)
            'gname': None,  # The group name (str)
            'link': None,  # The link target/source, if a link.
            # The MD5 checksum digest (str? hex?). "md5digest" is a synonym for this, so it's consolidated in
            # as the same keyword.
            'md5': None,
            # The mode (in octal) (we convert it to a python-native int for os.chmod/stat, etc.)
            # May also be a symbolic value; TODO: map symbolic to octal/int.
            'mode': None,
            'nlink': None,  # Number of hard links for this item.
            'optional': False,  # This item may or may not be present in the compared directory for checking.
            'rmd160': None,  # The RMD-160 checksum of the file. "rmd160digest" is a synonym.
            'sha1': None,  # The SHA-1 sum. "sha1digest" is a synonym.
            'sha256': None,  # SHA-2 256-bit checksum; "sha256digest" is a synonym.
            'sha384': None,  # SHA-2 384-bit checksum; "sha384digest" is a synonym.
            'sha512': None,  # SHA-2 512-bit checksum; "sha512digest" is a synonym.
            'size': None,  # Size of the file in bytes (int).
            'tags': [],  # mtree-internal tags (comma-separated in the mtree spec).
            'time': None,  # Time the file was last modified (parsed into a datetime.datetime).
            'uid': None,  # File owner UID (int)
            'uname': None  # File owner username (str)
            # And lastly, "children" is where the children files/directories go. We don't include it in the template;
            # it's added programmatically.
            # 'children': {}
        }
        # Global aspects are handled by "/set" directives.
        # They are restored by an "/unset". Since they're global and stateful, they're handled as an attribute
        # for the duration of the parse only.
        self.settings = copy.deepcopy(self._tplitem)
        self._parse_items()
        del self.settings, self._tplitem

    def _get_header(self):
        """Populate ``self.header`` from the leading ``# key: value`` lines.

        Reading stops at the first line that is not a user/machine/tree/date
        header line. ``date`` is parsed into a datetime and ``tree`` into a
        PosixPath; a value of ``(null)`` is stored as None. Returns an empty
        tuple.
        """
        self.header = {}
        for line in self._specdata:
            if not self._HEAD_RE.search(line):
                # Blank line = end of the header; anything else means the spec has no (more) header.
                break
            header, val = [i.lstrip() for i in self._CMT_RE.sub('', line).split(':', 1)]
            if val == '(null)':
                val = None
            elif header == 'date':
                val = datetime.datetime.strptime(val, _header_strptime_fmt)
            elif header == 'tree':
                val = pathlib.PosixPath(val)
            self.header[header] = val
        return ()

    def _kwparse(self, kwline):
        """Turn a list of ``key[=value]`` tokens into a dict. Used by both item specs and /set.

        A value of ``none`` and a missing value both become None (before any
        further conversion). Raises ValueError for an unrecognised ``type``.
        """
        out = {}
        for i in kwline:
            k, _sep, v = i.partition('=')
            if not _sep or v == 'none':
                v = None
            # These are represented as octals.
            if k == 'mode':
                # TODO: handle symbolic references too (e.g. rwxrwxrwx)
                if v is not None and v.isdigit():
                    v = int(v, 8)  # Convert from the octal. This can then be used directly with os.chmod etc.
            # These are represented as ints
            elif k in self._INT_KEYS:
                if v is not None and v.isdigit():
                    v = int(v)
            # These are booleans (represented as True by their presence).
            elif k in ('ignore', 'optional'):
                v = True
            # These are lists (comma-separated).
            elif k in ('flags', 'tags'):
                if v:
                    v = [j.strip() for j in v.split(',')]
            # The following are synonyms.
            elif self._HASH_RE.search(k):
                k = self._HASH_RE.sub(r'\g<1>', k)
            elif k == 'time':
                if v is not None:
                    v = datetime.datetime.fromtimestamp(float(v))
            elif k == 'type':
                if v not in self._TYPES:
                    raise ValueError('{0} not one of: {1}'.format(v, ', '.join(self._TYPES)))
            out[k] = v
        return out

    def _apply_unset(self, keywords):
        """Restore the defaults for *keywords* (the tokens following ``/unset``).

        ``all`` resets every setting. Hash synonyms (e.g. ``md5digest``) are
        canonicalised first; keywords that are not part of the item template
        are simply dropped from the active settings.
        """
        if 'all' in keywords:
            self.settings = copy.deepcopy(self._tplitem)
            return
        for k in keywords:
            k = self._HASH_RE.sub(r'\g<1>', k)
            if k in self._tplitem:
                self.settings[k] = copy.deepcopy(self._tplitem[k])
            else:
                self.settings.pop(k, None)

    def _parse_items(self):
        """Walk the spec lines and fill ``self.spec['paths']``. Returns an empty tuple."""
        # _curpath = self.header['tree']
        _curpath = pathlib.PosixPath('/')
        # The Business-End (TM)
        for line in self._specdata:
            # Skip these lines
            if self._IGNORE_RE.search(line):
                continue
            # We can safely split on whitespace since the man page specifies the
            # values are not to contain whitespace.
            l = line.split()
            if len(l) == 1 and self._PARENT_RE.search(l[0]):
                # Tested on the stripped token because specs commonly indent
                # the ".." lines along with the directory they close.
                _curpath = _curpath.parent
            elif l[0] == '/set':
                self.settings.update(self._kwparse(l[1:]))
            elif l[0] == '/unset':
                self._apply_unset(l[1:])
            else:
                # So it's an item, not a command.
                _itemsettings = copy.deepcopy(self.settings)
                _itemsettings.update(self._kwparse(l[1:]))
                _fname = pathlib.PosixPath(os.path.normpath(_curpath.joinpath(l[0])))
                if _itemsettings['type'] == 'dir':
                    # Entering a directory: subsequent relative entries live under it.
                    _curpath = _fname
                self.spec['paths'][_fname] = _itemsettings
        return ()
def parseArgs():
    """Build the command-line parser for the mtree spec reader.

    Returns the configured ``argparse.ArgumentParser`` itself (not parsed
    arguments); it takes one positional argument, ``specfile``.
    """
    parser = argparse.ArgumentParser(description='An mtree parser')
    # TODO: support stdin piping
    parser.add_argument('specfile', help='The path to the spec file to parse')
    return parser
# Allow to be run as a CLI utility as well.
def main():
    """Parse the spec file named on the command line and dump the result.

    The continuation-joined spec lines are written to ``/tmp/newspec`` and
    the remaining members of the parser object are pretty-printed to stdout
    at the current terminal width.
    """
    import inspect
    import pprint
    import shutil
    cli = vars(parseArgs().parse_args())
    specpath = os.path.abspath(os.path.expanduser(cli['specfile']))
    with open(specpath) as src:
        mt = MTreeParse(src.read())
    # NOTE(review): debugging aid that writes to a fixed, predictable path in /tmp.
    with open('/tmp/newspec', 'w') as dump:
        dump.write('\n'.join(mt._specdata))
    # Drop the bulky raw text so the dump below only shows the parsed result.
    del mt.orig_spec
    del mt._specdata
    members = inspect.getmembers(mt)
    columns = shutil.get_terminal_size()[0]
    pprint.pprint(members, width=columns)


if __name__ == '__main__':
    main()