Source code for punx.h5tree

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#-----------------------------------------------------------------------------
# :author:    Pete R. Jemian
# :email:     prjemian@gmail.com
# :copyright: (c) 2014-2018, Pete R. Jemian
#
# Distributed under the terms of the Creative Commons Attribution 4.0 International Public License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
#-----------------------------------------------------------------------------

"""
Describe the tree structure of any HDF5 file

.. autosummary::

    ~Hdf5TreeView
"""

import os       #@UnusedImport
import sys      #@UnusedImport
import h5py
import numpy

from . import utils


[docs]class Hdf5TreeView(object):
    """
    Describe the tree structure of any HDF5 file
    
    Example usage showing default display::
    
        mc = Hdf5TreeView(filename)
        mc.array_items_shown = 5
        show_attributes = False
        txt = mc.report(show_attributes)
    """
    requested_filename = None
    isNeXus = False
    array_items_shown = 5

    def __init__(self, filename):
        """store filename and test if file is NeXus HDF5"""
        self.requested_filename = filename
        self.filename = None
        self.show_attributes = True
        if os.path.exists(filename):
            self.filename = filename
            self.isNeXus = utils.isNeXusFile(filename)

[docs]    def report(self, show_attributes=True):
        """
        return the structure of the HDF5 file in a list of strings
        
        The work of parsing the datafile is done in this method.
        """
        if self.filename is None: return None
        self.show_attributes = show_attributes
        f = h5py.File(self.filename, 'r')
        txt = self.filename
        if self.isNeXus:
            txt += " : NeXus data file"
        tree_string_list = self._renderGroup(f, txt, indentation = "")
        f.close()
        return tree_string_list

    def _renderGroup(self, obj, name, indentation = "  "):
        """return a [formatted_string] with the contents of the group"""
        s = []
        nxclass = obj.attrs.get('NX_class', '')
        if len(nxclass) > 0:
            if isinstance(nxclass, numpy.ndarray):      # attribute reported as DATATYPE SIMPLE
                nxclass = nxclass[0]                    # convert as if DATATYPE SCALAR
            nxclass = ":" + utils.decode_byte_string(nxclass)
        s += [ indentation + name + nxclass ]
        s += self._renderAttributes(obj, indentation)
        # show datasets and links next
        groups = []
        for itemname in sorted(obj):
            linkref  = obj.get(itemname, getlink=True)
            try:
                # this will fail for external links if file is not available
                classref = obj.get(itemname, getclass=True)
            except KeyError:
                classref = None

            if classref is None:
                s += [ '%s  %s: external file missing' % (indentation, itemname) ]
                fmt = '%s    %s = %s'
                s += [ fmt % (indentation, '@file', utils.decode_byte_string(linkref.filename)) ]
                s += [ fmt % (indentation, '@path', utils.decode_byte_string(linkref.path)) ]
            else:
                value = obj.get(itemname)
                if utils.isNeXusLink(value):
                    s += self._renderLinkedObject(value, itemname, indentation+"  ")
                elif utils.isHdf5Group(value) or utils.isHdf5FileObject(value):
                    groups.append(value)
                    # TODO: report external group links in the right place
                    # The problem is the link file and path need to be fed into the
                    # next call to _renderGroup().  No such design exists now for that. 
                elif utils.isHdf5Dataset(value):
                    s += self._renderDataset(value, itemname, indentation+"  ")
                    if utils.isHdf5ExternalLink(obj, linkref):      # TODO: is obj the "parent"
                        # When "classref" is defined, then external data is available
                        fmt = '%s    %s = %s'
                        s += [ fmt % (indentation, '@file', utils.decode_byte_string(linkref.filename)) ]
                        s += [ fmt % (indentation, '@path', utils.decode_byte_string(linkref.path)) ]
                else:
                    msg = "unidentified %s: %s, %s", itemname, repr(classref), repr(linkref)
                    raise Exception(msg)

        for value in groups:        # show things that look like groups
            itemname = value.name.split("/")[-1]
            s += self._renderGroup(value, itemname, indentation+"  ")
        
        return s

    def _renderAttributes(self, obj, indentation = "  "):
        """return a [formatted_string] with any attributes"""
        s = []
        if self.show_attributes:
            for name, value in obj.attrs.items():       # FIXME: for name, value in obj.attrs.iteritems():
                s.append("%s  @%s = %s" % (indentation, name, utils.decode_byte_string(value)))
        return s

    def _renderLinkedObject(self, obj, name, indentation = "  "):
        """return a [formatted_string] with the name and target of a NeXus linked object"""
        s = []
        s.append("%s%s --> %s" % (indentation, name, obj.attrs['target']))
        return s

    def _renderDataset(self, dset, name, indentation = "  "):
        """return a [formatted_string] with the contents and structure of a dataset"""
        shape = dset.shape
        # dset.dtype.kind == 'S', nchar = dset.dtype.itemsize
        if self.isNeXus:
            if "target" in dset.attrs:
                if dset.attrs['target'] != dset.name:
                    return ["%s%s --> %s" % (indentation, name, utils.decode_byte_string(dset.attrs['target']))]
        txType = self._renderDsType(dset)
        txShape = self._renderDsShape(dset)
        s = []
        if dset.dtype.kind == 'S':
            if isinstance(dset.value, numpy.ndarray):
                value = " = %s" % utils.decode_byte_string(dset.value[0])
            else:
                value = " = %s" % utils.decode_byte_string(dset.value)
            s += [ "%s%s:%s%s" % (indentation, name, txType, value) ]
            s += self._renderAttributes(dset, indentation)
            # dset.dtype.kind == 'S', nchar = dset.dtype.itemsize
        elif dset.dtype.kind == 'O':
            value = " = %s" % str(dset.value)
            s += [ "%s%s:%s%s" % (indentation, name, txType, value) ]
            s += self._renderAttributes(dset, indentation)
        elif shape == (1,):
            value = " = %s" % str(dset[0])
            s += [ "%s%s:%s%s%s" % (indentation, name, txType, 
                                   txShape, value) ]
            s += self._renderAttributes(dset, indentation)
        else:

            if self.array_items_shown > 2:
                value = self._renderArray(dset, indentation + '  ')
                if len(dset.shape) < 2:
                    # show the array inline with the field
                    s += [ "%s%s:%s%s = %s" % (
                            indentation, name, txType, txShape, utils.decode_byte_string(value)) ]
                else:
                    # show multi-D arrays different
                    s += [ "%s%s:%s%s = __array" % (
                            indentation, name, txType, txShape) ]
                    s += [ "%s  %s = %s" % (indentation, "__array", utils.decode_byte_string(value)) ]
            else:
                s += [ "%s%s:%s%s = [ ... ]" % (
                        indentation, name, txType, txShape) ]

            # show these after __array
            s += self._renderAttributes(dset, indentation)
        return s

    def _renderDsType(self, obj):
        """get the storage (data) type of the dataset"""
        t = str(obj.dtype)
        # dset.dtype.kind == 'S', nchar = dset.dtype.itemsize
        if obj.dtype.kind == 'S':        # fixed-length string
            t = 'char[%s]' % obj.dtype.itemsize
        elif obj.dtype.kind == 'O':      # variable-length string
            t = 'CHAR'
        if self.isNeXus:
            t = 'NX_' + t.upper()
        return t

    def _renderDsShape(self, obj):
        """return the shape of the HDF5 dataset"""
        s = obj.shape
        l = []
        for dim in s:
            l.append(str(dim))
        if l == ['1']:
            result = ""
        else:
            result = "[%s]" % ",".join(l)
        return result

    def _renderArray(self, obj, indentation = '  '):
        """nicely format an array up to arbitrary rank"""
        shape = obj.shape
        r = ""
        if len(shape) > 0:
            r = self._renderNdArray(obj, indentation + '  ')
        return r

    def _decideNumShown(self, n):
        """determine how many values to show"""
        if self.array_items_shown != None:
            if n > self.array_items_shown:
                n = self.array_items_shown - 2
        return n

    def _renderNdArray(self, obj, indentation = '  '):
        """return a list of lower-dimension arrays, nicely formatted"""
        
        def __render(obj, rank, key, indents):
            if rank == 1:
                item = obj[key]
            elif rank < 4:
                # this replaces a lot of code: if rank == ...
                indices = ', '.join( [str(key)] + (':'*(rank-1)).split() )
                part = eval('obj[%s]' % indices)
                item = self._renderNdArray(part, indents + '  ')    # recursion
            else:
                item = "rank=%d" % (rank-1)
                
            return item

        shape = obj.shape
        rank = len(shape)
        if rank < 1: return None
        n = self._decideNumShown( shape[0] )
        r = []
        for i in range(n):
            r.append( __render(obj, rank, i, indentation + '  ') )
        if n < shape[0]:
            r.append("...")    # skip over most
            r.append( __render(obj, rank, -1, indentation + '  ') ) # last one

        if rank == 1:
            s = str( r )
        else:
            s = "[\n" + indentation + '  '
            s += ("\n" + indentation + '  ').join( r )
            s += "\n" + indentation + "]"
        return s