# Source code for punx.h5tree

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# -----------------------------------------------------------------------------
# :author:    Pete R. Jemian
# :email:     prjemian@gmail.com
# :copyright: (c) 2014-2018, Pete R. Jemian
#
# Distributed under the terms of the Creative Commons Attribution 4.0 International Public License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# -----------------------------------------------------------------------------

"""
Describe the tree structure of any HDF5 file

.. autosummary::

    ~Hdf5TreeView
"""

import logging
import os
import h5py
import numpy

from . import utils


# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class Hdf5TreeView(object):
    """
    Describe the tree structure of any HDF5 file

    Example usage showing default display::

        mc = Hdf5TreeView(filename)
        mc.array_items_shown = 5
        show_attributes = False
        txt = mc.report(show_attributes)
    """

    # NOTE(review): whitespace inside the string literals below is reproduced
    # exactly as it appeared in the reviewed listing, whose whitespace runs had
    # been collapsed.  Confirm the intended indent widths against the
    # repository copy before merging.

    # file name exactly as supplied by the caller
    requested_filename = None
    # True when utils.isNeXusFile() reports the file as NeXus
    isNeXus = False
    # maximum number of items previewed along each array axis
    # (None: no limit, see _decideNumShown)
    array_items_shown = 5

    def __init__(self, filename):
        """store filename and test if file is NeXus HDF5"""
        self.requested_filename = filename
        self.filename = None  # remains None when the file does not exist
        self.show_attributes = True
        if os.path.exists(filename):
            self.filename = filename
            self.isNeXus = utils.isNeXusFile(filename)

    def report(self, show_attributes=True):
        """
        Return the structure of the HDF5 file in a list of strings.

        The work of parsing the datafile is done in this method.

        The hierarchy of the file is represented by indentation
        using spaces.  Attributes are signified using ``@``.
        Group/dataset names are separated from their datatypes
        using ``:``.  A preview of the value of an item follows
        the ``=``.

        For example (illustrative; exact indentation and quoting follow
        the format strings used by the rendering methods of this class):

        .. code-block:: python
            :linenos:

            [
              '/tmp/tmpb7iqqapu.hdf5',
              ' external_data:NXdata',
              ' @NX_class = NXdata',
              ' @signal = x',
              ' x:int64 = 0',
            ]

        Parameters
        ----------
        show_attributes : bool
            when True (default), attributes are included in the report

        Returns
        -------
        list of str, or ``None`` if the requested file was not found
        by the constructor.
        """
        if self.filename is None:
            return None
        self.show_attributes = show_attributes
        with h5py.File(self.filename, "r") as f:
            txt = self.filename
            if self.isNeXus:
                txt += " : NeXus data file"
            tree_string_list = self._renderGroup(f, txt, indentation="")
        return tree_string_list

    def _renderGroup(self, obj, name, indentation=" ", md=None):
        """return a [formatted_string] with the contents of the group

        Parameters
        ----------
        obj : instance of ``h5py.Group``
        name : str
            the name of the group
        indentation : str
            prefix applied to every line produced for this group
        md : dict
            If group was an ExternalLink, then keys
            ``filename`` and ``path`` describe the
            external link point.  If not ExternalLink,
            the dictionary contents will not be used.

        Raises
        ------
        Exception
            when a child is neither a NeXus link, a group/file, nor a dataset
        """
        s = []
        nxclass = obj.attrs.get("NX_class", "")
        if len(nxclass) > 0:
            if isinstance(nxclass, numpy.ndarray):
                # attribute reported as DATATYPE SIMPLE
                nxclass = nxclass[0]  # convert as if DATATYPE SCALAR
            nxclass = ":" + utils.decode_byte_string(nxclass)
        s += [indentation + name + nxclass]

        extra_attrs = {}
        if isinstance(md, h5py.ExternalLink):
            # also report external group links (file & path)
            extra_attrs = dict(file=md.filename, path=md.path)
        s += self._renderAttributes(obj, indentation, extra_attrs)

        # show datasets and links next
        groups = []
        for itemname in sorted(obj):
            link_info = obj.get(itemname, getlink=True)

            # prevent fails of obj.get(itemname, getclass=True)
            # for external links if file is not available
            if (
                isinstance(link_info, h5py.ExternalLink)
                and not os.path.exists(link_info.filename)
            ):
                classref = None
                logger.debug(
                    "FileNotFound: external file=%s  external HDF5 addr=%s",
                    link_info.filename,
                    link_info.path
                )
            elif isinstance(link_info, h5py.SoftLink):
                classref = None
                logger.debug("SoftLink: HDF5 addr=%s", link_info.path)
            else:
                classref = obj.get(itemname, getclass=True)

            if classref is None:
                if isinstance(link_info, h5py.SoftLink):
                    s += ["%s %s: --> %s" % (indentation, itemname, link_info.path)]
                else:
                    s += ["%s %s: missing external file" % (indentation, itemname)]
                if self.show_attributes:
                    # getattr() with a default: not every link type carries
                    # both ``filename`` and ``path``
                    for nm, attr in ("file", "filename"), ("path", "path"):
                        v = getattr(link_info, attr, None)
                        if v is not None:
                            s += [self._renderSingleAttribute(indentation + " ", nm, v)]
            else:
                value = obj.get(itemname)
                if utils.isNeXusLink(value):
                    s += self._renderLinkedObject(value, itemname, indentation + " ")
                elif utils.isHdf5Group(value) or utils.isHdf5FileObject(value):
                    # groups are rendered after all datasets and links
                    groups.append((value, itemname, link_info))
                elif utils.isHdf5Dataset(value):
                    s += self._renderDataset(value, itemname, indentation + " ")
                    if self.show_attributes and utils.isHdf5ExternalLink(
                        obj, link_info
                    ):  # TODO: is obj the "parent"
                        # When "classref" is defined, then external data is available
                        # _renderSingleAttribute() returns one string: wrap it in
                        # a list so ``+=`` adds one line, not one item per character
                        s += [
                            self._renderSingleAttribute(
                                indentation + " ", "file", link_info.filename
                            )
                        ]
                        s += [
                            self._renderSingleAttribute(
                                indentation + " ", "path", link_info.path
                            )
                        ]
                else:
                    msg = "unidentified %s: %s, %s" % (
                        itemname,
                        repr(classref),
                        repr(link_info),
                    )
                    raise Exception(msg)

        for value, itemname, md in groups:
            # show things that look like groups
            s += self._renderGroup(value, itemname, indentation + " ", md)

        return s

    def _renderSingleAttribute(self, indentation, name, value):
        """return one formatted string: ``<indentation> @<name> = <value>``

        The value is passed through ``utils.decode_byte_string()`` first.
        A ``str``, or a ``list`` whose first item is a ``str``, is shown
        in double quotes.
        """
        value = utils.decode_byte_string(value)
        # Wrap str and list of str in double quotes.
        # ``value and ...``: an empty list has no first item to inspect.
        if isinstance(value, list) and value and isinstance(value[0], str):
            value = '["' + '", "'.join(value) + '"]'
        elif isinstance(value, str):
            value = f'"{value}"'
        return f'{indentation} @{name} = {value}'

    def _renderAttributes(self, obj, indentation=" ", extra=None):
        """return a [formatted_string] with any attributes

        Parameters
        ----------
        obj : object with an ``attrs`` mapping (only read when
            ``self.show_attributes`` is true)
        indentation : str
            prefix applied to every attribute line
        extra : dict or None
            additional name/value pairs reported after those of ``obj.attrs``
        """
        if not self.show_attributes:
            return []
        s = []
        for d in (obj.attrs, extra or {}):
            for name, value in d.items():
                s.append(self._renderSingleAttribute(indentation, name, value))
        return s

    def _renderLinkedObject(self, obj, name, indentation=" "):
        """return a [formatted_string] with the name and target of a NeXus linked object"""
        return ["%s%s --> %s" % (indentation, name, obj.attrs["target"])]

    def _renderDataset(self, dset, name, indentation=" "):
        """return a [formatted_string] with the contents and structure of a dataset

        In a NeXus file, a dataset whose ``target`` attribute differs from
        its own name is reported as a link (``name --> target``) only.
        """
        shape = dset.shape
        # dset.dtype.kind == 'S', nchar = dset.dtype.itemsize
        if self.isNeXus:
            if "target" in dset.attrs:
                if dset.attrs["target"] != dset.name:
                    return [
                        "%s%s --> %s"
                        % (
                            indentation,
                            name,
                            utils.decode_byte_string(dset.attrs["target"]),
                        )
                    ]
        txType = self._renderDsType(dset)
        txShape = self._renderDsShape(dset)
        s = []
        if dset.dtype.kind == "S":
            data = dset[()]  # read the content once
            if isinstance(data, numpy.ndarray):
                quoted = ['"' + utils.decode_byte_string(item) + '"' for item in data]
                if len(quoted) > 1:
                    value = " = [%s]" % ", ".join(quoted)
                else:
                    value = " = %s" % ", ".join(quoted)
            else:
                value = " = %s" % utils.decode_byte_string(data)
            s += ["%s%s:%s%s" % (indentation, name, txType, value)]
            s += self._renderAttributes(dset, indentation)
        elif dset.dtype.kind == "O":
            value = " = %s" % str(dset[()])
            s += ["%s%s:%s%s" % (indentation, name, txType, value)]
            s += self._renderAttributes(dset, indentation)
        elif shape == (1,):
            value = " = %s" % str(dset[0])
            s += ["%s%s:%s%s%s" % (indentation, name, txType, txShape, value)]
            s += self._renderAttributes(dset, indentation)
        else:
            limit = self.array_items_shown
            # None means "no limit" (see _decideNumShown); do not compare it with an int
            if limit is None or limit > 2:
                value = self._renderArray(dset, indentation + " ")
                if len(dset.shape) < 2:
                    # show the array inline with the field
                    s += [
                        "%s%s:%s%s = %s"
                        % (
                            indentation,
                            name,
                            txType,
                            txShape,
                            utils.decode_byte_string(value),
                        )
                    ]
                else:
                    # show multi-D arrays different
                    s += ["%s%s:%s%s = __array" % (indentation, name, txType, txShape)]
                    s += [
                        "%s %s = %s"
                        % (indentation, "__array", utils.decode_byte_string(value))
                    ]
            else:
                s += ["%s%s:%s%s = [ ... ]" % (indentation, name, txType, txShape)]

            # show these after __array
            s += self._renderAttributes(dset, indentation)
        return s

    def _renderDsType(self, obj):
        """get the storage (data) type of the dataset"""
        t = str(obj.dtype)
        # dset.dtype.kind == 'S', nchar = dset.dtype.itemsize
        if obj.dtype.kind == "S":
            # fixed-length string
            if len(obj.shape):
                t = "char[%s]" % ",".join([str(o.dtype.itemsize) for o in obj])
            else:
                t = "CHAR"
        elif obj.dtype.kind == "O":
            # variable-length string
            t = "CHAR"
        if self.isNeXus:
            t = "NX_" + t.upper()
        return t

    def _renderDsShape(self, obj):
        """return the shape of the HDF5 dataset

        The result is ``[d0,d1,...]``, or an empty string for shape ``(1,)``.
        """
        dims = [str(dim) for dim in obj.shape]
        if dims == ["1"]:
            return ""
        return "[%s]" % ",".join(dims)

    def _renderArray(self, obj, indentation=" "):
        """nicely format an array up to arbitrary rank

        Returns an empty string when ``obj`` has no dimensions.
        """
        if len(obj.shape) > 0:
            return self._renderNdArray(obj, indentation + " ")
        return ""

    def _decideNumShown(self, n):
        """determine how many values to show

        When ``n`` exceeds ``array_items_shown``, two slots are held back
        for the ``...`` marker and the final item added by the caller.
        """
        if self.array_items_shown is not None:
            if n > self.array_items_shown:
                n = self.array_items_shown - 2
        return n

    def _renderNdArray(self, obj, indentation=" "):
        """return a list of lower-dimension arrays, nicely formatted

        Returns ``None`` when ``obj`` has no dimensions, otherwise a string.
        """

        def render_item(array, rank, key, indents):
            """render item ``key`` along the first axis of ``array``"""
            if rank == 1:
                return array[key]
            if rank < 4:
                # plain indexing selects the same sub-array that used to be
                # obtained through eval() on a generated index expression
                part = array[key, :]
                return self._renderNdArray(part, indents + " ")  # recursion
            return "rank=%d" % (rank - 1)

        shape = obj.shape
        rank = len(shape)
        if rank < 1:
            return None
        n = self._decideNumShown(shape[0])
        r = [render_item(obj, rank, i, indentation + " ") for i in range(n)]
        if n < shape[0]:
            r.append("...")  # skip over most
            r.append(render_item(obj, rank, -1, indentation + " "))  # last one
        if rank == 1:
            s = str(r)
        else:
            s = "[\n" + indentation + " "
            s += ("\n" + indentation + " ").join(r)
            s += "\n" + indentation + "]"
        return s