# Source code for punx.schema_manager

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#-----------------------------------------------------------------------------
# :author:    Pete R. Jemian
# :email:     prjemian@gmail.com
# :copyright: (c) 2016-2017, Pete R. Jemian
#
# Distributed under the terms of the Creative Commons Attribution 4.0 International Public License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
#-----------------------------------------------------------------------------

"""
manages the XML Schema of this project

The *schema_manager* calls the *cache_manager* and
is called by *nxdl_manager*.

Public

.. autosummary::
   
   ~SchemaManager
   ~Schema_Root
   ~Schema_Attribute
   ~Schema_Element
   ~Schema_Type
   ~Schema_pattern
   ~Schema_nxdlType
   ~get_default_schema_manager
   ~raise_error
   ~strip_ns

Internal

.. autosummary::
   
   ~_Mixin
   ~_GroupParsing
   ~_Recursion

"""


from __future__ import print_function

import lxml.etree
import os
from . import NAMESPACE_DICT, FileNotFound, InvalidNxdlFile
from . import singletons
from . import utils


logger = utils.setup_logger(__name__)


def strip_ns(ref):
    """
    strip the namespace prefix from ``ref``

    :param str ref: one word, colon delimited string, such as *nx:groupGroup*
    :returns str: the part to the right of the last colon
        (all of ``ref`` when it contains no colon)
    """
    # rpartition() yields (head, sep, tail); tail is the text after the
    # last colon, or the whole string when there is no colon at all
    return ref.rpartition(':')[2]


def raise_error(node, text, obj):
    """
    standard *ValueError* exception handling

    This function never returns; the message has the form
    ``line <sourceline>: <text><obj>``.

    :param obj node: XML node being processed, only its ``sourceline``
        attribute is read here
    :param str text: label for ``obj``
    :param obj obj: value, converted with ``str()`` and appended to ``text``
    :raises ValueError: always
    """
    msg = 'line {0}: {1}{2}'.format(node.sourceline, text, str(obj))
    raise ValueError(msg)


def get_default_schema_manager():
    """
    internal: convenience function

    :returns obj: the ``schema_manager`` attribute of the default file set
        reported by the cache manager
    :raises ValueError: if the cache manager has no default file set
    """
    # imported here, not at module level, same as SchemaManager.__init__()
    # NOTE(review): presumably this avoids a circular import -- confirm
    from punx import cache_manager
    cm = cache_manager.CacheManager()
    # explicit test (not ``assert``) so the check survives ``python -O``
    if cm is None or cm.default_file_set is None:
        raise ValueError('Could not get NXDL file set from the cache')
    return cm.default_file_set.schema_manager


class SchemaManager(object):

    """
    describes the XML Schema for the NeXus NXDL definitions files

    :param str path: directory containing ``nxdl.xsd``
        (default: path of the default file set from the cache manager)
    :raises ValueError: ``path`` not given and the cache manager
        has no default file set
    :raises FileNotFound: ``nxdl.xsd`` was not found in ``path``
    :raises InvalidNxdlFile: the schema does not have exactly one
        top-level ``xs:element``
    """

    ns = NAMESPACE_DICT

    def __init__(self, path=None):
        if path is None:
            path = self._default_path()
        schema_file = os.path.join(path, 'nxdl.xsd')
        if not os.path.exists(schema_file):
            raise FileNotFound(schema_file)
        self.schema_file = schema_file

        self.lxml_tree = lxml.etree.parse(self.schema_file)
        self.lxml_schema = lxml.etree.XMLSchema(self.lxml_tree)
        self.lxml_root = self.lxml_tree.getroot()

        nodes = self.lxml_root.xpath('xs:element', namespaces=self.ns)
        if len(nodes) != 1:
            raise InvalidNxdlFile(self.schema_file)
        # Schema_Root calls back into parse_nxdlTypes() and
        # parse_nxdl_patterns() of this object while it is constructed
        self.nxdl = Schema_Root(
            nodes[0],
            ns_dict=self.ns,
            schema_root=self.lxml_root,
            schema_manager=self)

        # cleanup these internal structures
        del self.lxml_root
        #del self.lxml_schema    # needed for XML file validation
        del self.lxml_tree

    @staticmethod
    def _default_path():
        """
        internal: directory of the default NXDL file set from the cache

        :raises ValueError: the cache manager has no default file set
        """
        from punx import cache_manager
        cm = cache_manager.CacheManager()
        if cm is None or cm.default_file_set is None:
            raise ValueError('Could not get NXDL file set from the cache')
        return cm.default_file_set.path

    def _get_schema_root(self):
        """
        internal: root node of the parsed ``nxdl.xsd`` file

        ``__init__()`` deletes ``self.lxml_root`` when it finishes, so the
        schema file is parsed again when this is called after construction.
        """
        root = getattr(self, 'lxml_root', None)
        if root is None:
            root = lxml.etree.parse(self.schema_file).getroot()
        return root

    def parse_nxdl_patterns(self):
        """
        get regexp patterns for validItemName, validNXClassName, & validTargetName from nxdl.xsd

        Every top-level ``xs:simpleType`` whose name starts with ``valid``
        is reported.

        :returns dict: :class:`Schema_pattern` objects, keyed by type name
        :raises ValueError: such a type does not have exactly
            one ``xs:restriction`` child
        """
        db = {}
        root = self._get_schema_root()
        for node in root.xpath('xs:simpleType', namespaces=self.ns):
            key = node.attrib['name']
            if not key.startswith('valid'):
                continue

            subnodes = node.xpath('xs:restriction', namespaces=self.ns)
            if len(subnodes) != 1:
                msg = 'expected exactly one xs:restriction in simpleType: '
                raise_error(node, msg, key)
            restriction = subnodes[0]

            obj = Schema_pattern()
            obj.pattern_name = key
            obj.base = strip_ns(restriction.attrib['base'])
            db[key] = obj

            for item in restriction:
                if isinstance(item, lxml.etree._Comment):
                    continue
                if item.tag.endswith('}pattern'):
                    obj.re_list.append(item.attrib['value'])
                elif item.tag.endswith('}maxLength'):
                    obj.maxLength = int(item.attrib['value'])

        # adjust for any restrictions with NeXus base:
        # a pattern based on another pattern found above takes the base,
        # the maxLength, and (in addition to its own) the expressions
        # of that other pattern
        for v in db.values():
            if v.base != 'token' and v.base in db:
                base = db[v.base]
                v.base = base.base
                v.maxLength = base.maxLength
                v.re_list += base.re_list

        return db

    def parse_nxdlTypes(self):
        """
        get the allowed data types and unit types from nxdlTypes.xsd

        The file is read from the directory of ``self.schema_file`` or,
        if that file does not exist, from the default file set of the cache.
        Sets ``self.types_file`` as a side effect.

        :returns tuple: ``(db, units)`` where ``db`` is a dictionary of
            :class:`Schema_nxdlType` objects keyed by name (without the
            entries ``anyUnitsAttr`` and ``primitiveType``) and ``units``
            is the list of values of the ``anyUnitsAttr`` entry
        :raises FileNotFound: ``nxdlTypes.xsd`` was not found
        :raises KeyError: ``anyUnitsAttr`` or ``primitiveType`` is
            not defined in the file
        """
        if os.path.exists(self.schema_file):
            path = os.path.dirname(self.schema_file)
        else:
            path = self._default_path()

        self.types_file = os.path.join(path, 'nxdlTypes.xsd')
        if not os.path.exists(self.types_file):
            raise FileNotFound(self.types_file)
        root = lxml.etree.parse(self.types_file).getroot()

        db = {}
        for node in root:
            if isinstance(node, lxml.etree._Comment):
                continue
            if node.tag.endswith('}annotation'):
                continue
            obj = Schema_nxdlType(node, ns_dict=self.ns, schema_root=root)
            if obj.name is not None:
                db[obj.name] = obj

        # re-arrange
        units = list(db['anyUnitsAttr'].values or [])
        del db['anyUnitsAttr']
        del db['primitiveType']

        return db, units


class Schema_pattern(object):

    """
    describe the regular expression patterns for names of NeXus things

    Instances are created empty, the content is assigned by the caller.
    """

    def __init__(self):
        self.base = 'token'         # name of the base type (no namespace prefix)
        self.pattern_name = None    # name of this pattern type
        self.re_list = []           # regular expressions, as text
        self.maxLength = -1  # unlimited


class Schema_nxdlType(object):

    """
    one of the types defined in the file *nxdlTypes.xsd*

    :param lxml.etree.Element xml_obj: XML element that defines the type
    :param dict ns_dict: accepted for symmetry with the other classes,
        not used here
    :param obj schema_root: optional, kept as ``self.schema_root``
    :raises ValueError: ``xml_obj`` has a child with an unhandled tag

    Attributes set from the children of ``xml_obj``:

    * ``restriction`` : base type of a ``restriction`` child (or ``None``)
    * ``union`` : list of member type names of a ``union`` child (or ``None``)
    * ``values`` : list with the item type of a ``list`` child or the
      enumeration values of a ``restriction`` child (or ``None``)
    """

    def __init__(self, xml_obj, ns_dict=None, schema_root=None):
        self.name = xml_obj.attrib.get('name')
        self.restriction = None
        self.union = None
        self.values = None
        self.schema_root = schema_root
        self.attrs = {}

        for node in xml_obj:
            if isinstance(node, lxml.etree._Comment):
                continue
            if node.tag.endswith('}annotation'):
                continue

            if node.tag.endswith('}list'):
                # a real list: map() would be a one-shot iterator in Python 3
                self.values = [strip_ns(node.attrib['itemType'])]
            elif node.tag.endswith('}restriction'):
                self.restriction = strip_ns(node.attrib['base'])
                self.values = [
                    subnode.attrib['value']
                    for subnode in node
                    if not isinstance(subnode, lxml.etree._Comment)
                    and subnode.tag.endswith('}enumeration')
                ]
            elif node.tag.endswith('}union'):
                # a real list, for the same reason as above
                self.union = [
                    strip_ns(member)
                    for member in node.attrib['memberTypes'].split()
                ]
            else:
                raise_error(node, 'unhandled tag=', node.tag)


class _Mixin(object):
    
    """
    common code for NXDL Rules classes below
    
    :param lxml.etree.Element xml_obj: XML element
    :param str obj_name: optional, default taken from ``xml_obj``
    :param dict ns_dict: optional, default taken from :data:`__init__.NAMESPACE_DICT`
    :param obj schema_root: optional, instance of lxml.etree._Element
    """
    
    def __init__(self, xml_obj, obj_name=None, ns_dict=None, schema_root=None):
        self.name = obj_name or xml_obj.attrib.get('name')
        self.ns = ns_dict or NAMESPACE_DICT
        self.lxml_root = schema_root
    
    def get_named_node(self, tag, attribute, value):
        """
        return a named node from the XML Schema
        
        :param str tag: XML Schema tag (such as "complexType") to match
        :param str attribute: attribute name to match
        :param str value: attribute value to match
        """
        if self.lxml_root is None:
            raise ValueError
        root = self.lxml_root
        xpath_str = 'xs:' + tag
        xpath_str += '[@' + attribute
        xpath_str += '="' + value + '"]'
        node_list = root.xpath(xpath_str, namespaces=self.ns)
        if len(node_list) != 1:
            msg = 'wrong number of ' + tag
            msg += ' nodes found: ' + str(len(node_list))
            raise ValueError(msg)
        return node_list[0]
    
    def copy_to(self, target):
        """
        copy results into target object
        
        :param obj target: instance of _Mixin, such as Schema_Element
        """
        for k, v in self.attrs.items():
            target.attrs[k] = v
        for k, v in self.children.items():
            target.children[k] = v

    def parse_attribute(self, node):
        """ """
        obj = Schema_Attribute(node, schema_root=self.lxml_root)
        self.attrs[obj.name] = obj

    def parse_attributeGroup(self, node):
        """ """
        obj = Schema_Type(node.attrib.get('ref'), schema_root=self.lxml_root)
        obj.copy_to(self)

    def parse_complexContent(self, node):
        """ """
        for subnode in node:
            if subnode.tag.endswith('}extension'):
                ref = subnode.attrib.get('base')
                if ref not in ('nx:basicComponent'):
                    raise_error(subnode, 'unexpected base=', ref)
                obj = Schema_Type(ref, schema_root=self.lxml_root)
                obj.copy_to(self)

                # parse children of extension node
                for obj_node in subnode:
                    if obj_node.tag.endswith('}annotation'):
                        pass
                    elif obj_node.tag.endswith('}attribute'):
                        self.parse_attribute(obj_node)
                    elif obj_node.tag.endswith('}sequence'):
                        self.parse_sequence(obj_node)
                    else:
                        raise_error(obj_node, 'unexpected base=', obj_node.tag)

            else:
                raise_error(subnode, 'unexpected tag=', subnode.tag)

    def parse_group(self, node):
        """ """
        obj = Schema_Type(node.attrib.get('ref'), schema_root=self.lxml_root)
        obj.copy_to(self)


class Schema_Root(_Mixin):

    """
    root element of the nxdl.xsd file

    :param lxml.etree.Element element_node: the root ``xs:element`` node
    :param str obj_name: optional, default taken from ``element_node``
    :param dict ns_dict: optional, default taken from :data:`NAMESPACE_DICT`
    :param obj schema_root: optional, instance of lxml.etree._Element
    :param obj schema_manager: optional, object with the methods
        ``parse_nxdlTypes()`` and ``parse_nxdl_patterns()``.  When given,
        ``types``, ``units``, ``patterns``, and ``schema_types`` are set.
    :raises ValueError: ``element_node`` has no ``type`` attribute
        or an unhandled tag was found
    """

    patterns = None
    type = None
    units = None

    def __init__(self, element_node, obj_name=None, ns_dict=None, schema_root=None, schema_manager=None):
        _Mixin.__init__(
            self,
            element_node,
            obj_name=obj_name,
            ns_dict=ns_dict,
            schema_root=schema_root)

        # one dictionary per instance: as class attributes, these
        # would be shared (and filled) by every instance of this class
        self.attrs = {}
        self.children = {}

        self.schema_manager = schema_manager
        element_type = element_node.attrib.get('type')
        if element_type is None:
            element_name = element_node.attrib.get('name')
            raise_error(element_node, 'no @type for element node: ', element_name)

        ref = strip_ns(element_type)
        type_node = self.get_named_node('complexType', 'name', ref)

        for node in type_node:
            if isinstance(node, lxml.etree._Comment):
                continue
            if node.tag.endswith('}annotation'):
                continue
            if node.tag.endswith('}attribute'):
                self.parse_attribute(node)
            elif node.tag.endswith('}attributeGroup'):
                self.parse_attributeGroup(node)
            elif node.tag.endswith('}sequence'):
                self.parse_sequence(node)
            else:
                raise_error(node, 'unhandled tag=', node.tag)

        if schema_manager is not None:
            self.types, self.units = schema_manager.parse_nxdlTypes()
            self.patterns = schema_manager.parse_nxdl_patterns()
            self.schema_types = dict(definition=self) # FIXME:
            self.schema_types.update(self.children)

    def parse_sequence(self, seq_node):
        """
        parse the sequence used in the root element

        Each ``xs:element`` is stored in ``self.children`` by name, the
        content of each referenced ``xs:group`` is copied into this object.

        :param lxml.etree.Element seq_node: ``xs:sequence`` node
        :raises ValueError: unhandled tag in the sequence
        """
        for node in seq_node:
            if isinstance(node, lxml.etree._Comment):
                continue
            if node.tag.endswith('}element'):
                obj = Schema_Element(node, schema_root=self.lxml_root)
                self.children[obj.name] = obj
            elif node.tag.endswith('}group'):
                self.parse_group(node)
            else:
                msg = 'unhandled tag in ``definitionType``: '
                raise_error(node, msg, node.tag)


class Schema_Attribute(_Mixin):

    """
    xs:attribute element

    :param lxml.etree.Element xml_obj: XML element
    :param str obj_name: optional, default taken from ``xml_obj``
    :param dict ns_dict: optional, default taken from :data:`NAMESPACE_DICT`
    :param obj schema_root: optional, instance of lxml.etree._Element
    :raises ValueError: ``xml_obj`` is ``None`` or is not an
        ``attribute`` node in the namespace of its ``xs`` prefix

    Attributes set here:

    * ``required`` : ``True`` if ``@use`` is ``required``
    * ``type`` : content of ``@type`` (default: ``str``)
    * ``default_value`` : content of ``@default`` (or ``None``),
      converted to ``bool`` for type ``nx:NX_BOOLEAN``
    * ``enum`` : list of the enumeration values
    * ``patterns`` : list of the pattern values
    """

    def __init__(self, xml_obj, obj_name=None, ns_dict=None, schema_root=None):
        # explicit tests (not ``assert``) so they survive ``python -O``
        if xml_obj is None:
            raise ValueError('expected an xs:attribute node, received None')
        expected_tag = '{%s}attribute' % xml_obj.nsmap.get('xs')
        if xml_obj.tag != expected_tag:
            raise_error(xml_obj, 'expected an xs:attribute node, tag=', xml_obj.tag)

        _Mixin.__init__(
            self,
            xml_obj,
            obj_name=obj_name,
            ns_dict=ns_dict,
            schema_root=schema_root)

        use = xml_obj.attrib.get('use', 'optional')
        self.required = use in ('required', )

        self.type = xml_obj.attrib.get('type', 'str')
        default = xml_obj.attrib.get('default')
        if self.type in ('nx:NX_BOOLEAN',) and default is not None:
            # attribute content is always text: compare with '1', not 1
            self.default_value = default.lower() in ('true', 'y', '1')
        else:
            # includes a boolean attribute without @default: stays None
            self.default_value = default

        self.enum = self._restriction_values(xml_obj, 'enumeration')
        self.patterns = self._restriction_values(xml_obj, 'pattern')

    def _restriction_values(self, xml_obj, tag):
        """
        internal: list of ``@value`` from each
        ``xs:simpleType/xs:restriction/xs:<tag>`` node below ``xml_obj``
        """
        xpath_str = 'xs:simpleType/xs:restriction/xs:' + tag
        return [
            node.attrib['value']
            for node in xml_obj.xpath(xpath_str, namespaces=self.ns)
            if node.attrib.get('value') is not None
        ]

    def __str__(self, *args, **kwargs):
        # best effort: use the default text when any part is not available
        try:
            s = '@' + self.name
            s += ' : ' + self.type
            if len(self.enum):
                s += ' = '
                s += ' | '.join(self.enum)
            return s
        except Exception:
            return _Mixin.__str__(self, *args, **kwargs)


class Schema_Element(_Mixin):

    """
    xs:element

    An element with a ``type`` attribute receives the attributes and
    children of that named type.  An element without one is expected to
    contain an anonymous ``xs:complexType``; each ``xs:attribute`` directly
    below that complexType is stored in ``self.attrs``.

    :param lxml.etree.Element xml_obj: XML element
    :param str obj_name: optional, default taken from ``xml_obj``
    :param dict ns_dict: optional, default taken from :data:`NAMESPACE_DICT`
    :param obj schema_root: optional, instance of lxml.etree._Element
    :raises ValueError: unhandled tag below an element without ``type``

    :see: http://download.nexusformat.org/doc/html/nxdl.html
    :see: http://download.nexusformat.org/doc/html/nxdl_desc.html#nxdl-elements
    """

    def __init__(self, xml_obj, obj_name=None, ns_dict=None, schema_root=None):
        _Mixin.__init__(
            self,
            xml_obj,
            obj_name=obj_name,
            ns_dict=ns_dict,
            schema_root=schema_root)
        self.children = {}
        self.attrs = {}

        # read & analyze the NXDL structural *type* referenced by *ref*
        ref = self.type = xml_obj.attrib.get('type')
        if ref is None:
            self._parse_anonymous_type(xml_obj)
            return

        # avoid known infinite recursion: group may contain group(s)
        ok_to_parse = True
        if xml_obj.attrib.get('name') == 'group' and ref == 'nx:groupType':
            # NOTE(review): the flag is never reset in this module, so it
            # presumably stays set for the life of the process -- confirm
            # against singletons.Singleton
            if _GroupParsing().started:
                ok_to_parse = False
                # needs a special code to apply this rule
                #     isinstance(obj, _Recursion)
                self.children['group'] = _Recursion('group')
            _GroupParsing().started = True
        if ok_to_parse:
            type_obj = Schema_Type(ref, schema_root=self.lxml_root)
            type_obj.copy_to(self)

    def _parse_anonymous_type(self, xml_obj):
        """
        internal: get the attributes from the ``xs:complexType``
        defined within ``xml_obj``
        """
        for node in xml_obj:
            if isinstance(node, lxml.etree._Comment):
                continue
            if node.tag.endswith('}annotation'):
                continue
            if not node.tag.endswith('}complexType'):
                raise_error(node, 'unhandled tag=', node.tag)
            # findall(): keep every attribute (find() returns only the
            # first one, or None when the complexType has no attribute)
            for attr_node in node.findall('xs:attribute', self.ns):
                a = Schema_Attribute(attr_node, schema_root=self.lxml_root)
                self.attrs[a.name] = a


class Schema_Type(_Mixin):

    """
    a named NXDL structure type (such as groupGroup)

    The node is located by name among the direct children of
    ``schema_root``, then its attributes and children are parsed into
    ``self.attrs`` and ``self.children``.

    :param str ref: name of NXDL structure type (such as ``groupGroup``),
        a namespace prefix is removed
    :param str tag: XML Schema element tag, such as complexType (default=``*``)
    :param obj schema_root: optional, instance of lxml.etree._Element
    :raises ValueError: no schema root, not exactly one node with that
        name, or an unexpected tag was found

    :see: http://download.nexusformat.org/doc/html/nxdl.html
    :see: http://download.nexusformat.org/doc/html/nxdl_desc.html#nxdl-data-types-internal
    """

    def __init__(self, ref, tag = '*', schema_root=None):
        # _Mixin.__init__(self, xml_obj)
        # do the _Mixin.__init__ directly here
        # (the XML node is not known until get_named_node() returns)
        self.ns = NAMESPACE_DICT
        self.lxml_root = schema_root

        xml_obj = self.get_named_node(tag, 'name', strip_ns(ref))
        self.name = xml_obj.attrib.get('name')

        self.attrs = {}
        self.children = {}

        for node in xml_obj:
            if isinstance(node, lxml.etree._Comment):
                continue
            if node.tag.endswith('}annotation'):
                continue
            if node.tag.endswith('}attribute'):
                self.parse_attribute(node)
            elif node.tag.endswith('}attributeGroup'):
                self.parse_attributeGroup(node)
            elif node.tag.endswith('}complexContent'):
                self.parse_complexContent(node)
            elif node.tag.endswith('}group'):
                self.parse_group(node)
            elif node.tag.endswith('}sequence'):
                self.parse_sequence(node)
            else:
                raise_error(node, 'unexpected tag=', node.tag)

    def parse_sequence(self, node):
        """
        store the items of an ``xs:sequence`` node in ``self.children``

        ``xs:element`` and ``xs:group`` items are both stored as
        :class:`Schema_Element` objects, keyed by name.

        :param lxml.etree.Element node: ``xs:sequence`` node
        :raises ValueError: unexpected tag in the sequence
        """
        for subnode in node:
            if isinstance(subnode, lxml.etree._Comment):
                continue
            if subnode.tag.endswith('}any'):
                # do not process this one, only used for documentation
                continue
            # NOTE(review): Schema_Root.parse_sequence() expands a group
            # with Schema_Type, here it becomes a Schema_Element -- confirm
            # that this difference is intended
            if subnode.tag.endswith(('}element', '}group')):
                obj = Schema_Element(subnode, schema_root=self.lxml_root)
                self.children[obj.name] = obj
            else:
                raise_error(subnode, 'unexpected tag=', subnode.tag)


class _GroupParsing(singletons.Singleton):
    
    """
    internal: avoid a known recursion of group in a group

    :class:`Schema_Element` reads ``started`` before it parses an element
    named ``group`` of type ``nx:groupType`` and then sets it to ``True``.
    When the flag is already set, the element is not parsed again.
    """
    
    # False until the first ``group`` element has been seen
    # NOTE(review): presumably shared by all instances through
    # singletons.Singleton (defined elsewhere) -- confirm
    started = False


class _Recursion(_Mixin):

    """
    internal: an element used in recursion, such as child group of group

    This is a placeholder, no XML node is associated with it.

    :param str obj_name: name of the element this object stands for
    """

    def __init__(self, obj_name):
        # there is no XML element to provide a name: pass None
        super(_Recursion, self).__init__(
            None,
            obj_name=obj_name,
            ns_dict=None)


# if __name__ == '__main__':
#     sm = SchemaManager()
#     _breakpoint = True