Source code for _delb.nodes

# Copyright (C) 2018-'25  Frank Sachsenheim
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations

import warnings
from collections.abc import Iterable, Iterator, Mapping, MutableMapping
from itertools import chain
from typing import (
    TYPE_CHECKING,
    cast,
    overload,
    Any,
    Final,
    Literal,
    NamedTuple,
    Optional,
)

from _delb.exceptions import AmbiguousTreeError, InvalidCodePath, InvalidOperation
from _delb.filters import altered_default_filters, default_filters, is_tag_node
from _delb.grammar import _is_xml_char, _is_xml_name
from _delb.names import (
    XML_NAMESPACE,
    deconstruct_clark_notation,
    Namespaces,
)
from _delb.serializer import (
    DefaultStringOptions,
    FormatOptions,
    _StringWriter,
    _get_serializer,
)
from _delb.utils import (
    _StringMixin,
    _crunch_whitespace,
    last,
)
from _delb.typing import (
    CommentNodeType,
    _DocumentNodeType,
    ParentNodeType,
    ProcessingInstructionNodeType,
    Self,
    TagNodeType,
    TextNodeType,
    XMLNodeType,
)
from _delb.xpath import QueryResults, _css_to_xpath
from _delb.xpath import evaluate as evaluate_xpath, parse as parse_xpath
from _delb.xpath.ast import NameMatchTest, XPathExpression

if TYPE_CHECKING:
    from delb import Document
    from _delb.typing import (
        AttributeAccessor,
        _AttributesData,
        Filter,
        NamespaceDeclarations,
        NodeSource,
        QualifiedName,
    )


# constants


ATTRIBUTE_ACCESSOR_MSG: Final = (
    "An attribute name must be provided as string (either a local name or a "
    "universal in Clark notation) or as namespace and local name packed in a tuple."
)


# functions


def new_comment_node(content: str) -> CommentNode:  # pragma: no cover
    """
    Deprecated. Use :class:`CommentNode` directly.

    :param content: The comment's text, passed on to :class:`CommentNode`.
    :return: The newly created comment node.
    """
    warnings.warn(
        "This function is deprecated. Use CommentNode directly.",
        category=DeprecationWarning,
        # attribute the warning to the calling code instead of this wrapper
        stacklevel=2,
    )
    return CommentNode(content)


def new_processing_instruction_node(  # pragma: no cover
    target: str, content: str
) -> ProcessingInstructionNode:
    """
    Deprecated. Use :class:`ProcessingInstructionNode` directly.

    :param target: The processing instruction's target name.
    :param content: The processing instruction's text.
    :return: The newly created processing instruction node.
    """
    warnings.warn(
        "This function is deprecated. Use ProcessingInstructionNode directly.",
        category=DeprecationWarning,
        # attribute the warning to the calling code instead of this wrapper
        stacklevel=2,
    )
    return ProcessingInstructionNode(target, content)


def new_tag_node(  # pragma: no cover
    local_name: str,
    attributes: Optional[
        _AttributesData | dict[AttributeAccessor, str] | TagAttributes
    ] = None,
    namespace: Optional[str] = None,
    children: Iterable[NodeSource] = (),
) -> TagNode:
    """
    Deprecated. Use :class:`TagNode` directly.

    All arguments are passed on unchanged as keyword arguments to
    :class:`TagNode`.

    :param local_name: The tag name.
    :param attributes: Optional attributes for the new node.
    :param namespace: An optional tag namespace.
    :param children: An optional iterable of objects to add as child nodes.
    :return: The newly created tag node.
    """
    warnings.warn(
        "This function is deprecated. Use TagNode directly to instantiate new tag "
        "nodes.",
        category=DeprecationWarning,
        # attribute the warning to the calling code instead of this wrapper
        stacklevel=2,
    )
    return TagNode(
        local_name=local_name,
        attributes=attributes,
        namespace=namespace,
        children=children,
    )


def _reduce_whitespace_between_siblings(nodes: list[XMLNodeType] | Siblings):
    if not (
        text_nodes := tuple(
            (i, n) for i, n in enumerate(nodes) if isinstance(n, TextNode)
        )
    ):
        return

    in_tree = isinstance(nodes, Siblings)
    first_node = nodes[0]
    last_node = nodes[-1]
    empty_nodes = []

    for i, text_node in text_nodes:
        if reduced_content := _reduce_whitespace_content(
            text_node.content,
            text_node is first_node,
            text_node is last_node,
        ):
            text_node.content = reduced_content
        else:
            if in_tree:
                text_node.detach()
            else:
                empty_nodes.append(i)

    if empty_nodes:
        assert isinstance(nodes, list)
        for i in reversed(empty_nodes):
            del nodes[i]


def _reduce_whitespace_content(content: str, is_first: bool, is_last: bool) -> str:
    """
    Returns the given text content with reduced whitespace.

    The content is passed through ``_crunch_whitespace`` and stripped; a single
    leading and / or trailing space is then kept depending on the node's
    position among its siblings.

    :param content: The text node's content.
    :param is_first: Whether the text node is the first of its siblings.
    :param is_last: Whether the text node is the last of its siblings.
    :return: The reduced content, possibly an empty string.
    """
    squeezed = _crunch_whitespace(content)
    core = squeezed.strip()
    has_text = bool(core)

    # A leading space survives if the node isn't first, has non-space content
    # and started with a space.
    keep_leading = has_text and not is_first and squeezed.startswith(" ")

    if is_last:
        # A last node only keeps a trailing space when it is an only child
        # that consists solely of space.
        keep_trailing = is_first and not has_text
    else:
        # A node that isn't last keeps its trailing space, unless it is a first
        # node without any non-space content.
        keep_trailing = squeezed.endswith(" ") and (has_text or not is_first)

    return (" " if keep_leading else "") + core + (" " if keep_trailing else "")


# abstract tag definitions


[docs] class _TagDefinition(NamedTuple): """ Instances of this class describe tag nodes that are constructed from the context they are used in (commonly additions to a tree) and the properties that this description holds. For the sake of slick code they are not instantiated directly, but with the :func:`delb.tag` function. """ local_name: str attributes: Optional[dict[AttributeAccessor, str]] = None children: tuple[NodeSource, ...] = ()
# attributes
class Attribute(_StringMixin):
    """
    Attribute objects represent a tag node's attributes. See the
    :meth:`TagNode.attributes` documentation for capabilities.

    :param qualified_name: The attribute's namespace and local name as tuple.
    :param value: The attribute's value.
    """

    __slots__ = ("_attributes", "__qualified_name", "__value")

    def __init__(self, qualified_name: QualifiedName, value: str):
        # the TagAttributes mapping this attribute is bound to, if any
        self._attributes: TagAttributes | None = None
        self.__qualified_name = qualified_name
        # goes through the property setter and is hence validated
        self.value = value

    def __repr__(self):
        return (
            f'<{self.__class__.__name__}({self.universal_name}="{self.value}")'
            f" [{hex(id(self))}]>"
        )

    def __str__(self):
        return self.__value

    def __set_new_key(self, namespace: str, name: str):
        """
        Renames the attribute and, if it is bound to a :class:`TagAttributes`
        mapping, re-registers it there under the new qualified name.
        """
        new_key = (namespace, name)
        if new_key == self.__qualified_name:
            # assigning the current namespace or local name again changes
            # nothing and must not fail
            return

        if (attributes := self._attributes) is not None:
            if __debug__:
                assert attributes.pop(self.__qualified_name) is self
            else:
                del attributes[self.__qualified_name]
            attributes[new_key] = self
        self.__qualified_name = new_key

    @property
    def local_name(self) -> str:
        """The attribute's local name."""
        return self.__qualified_name[1]

    @local_name.setter
    def local_name(self, name: str):
        if not _is_xml_name(name):
            raise ValueError(f"`{name}` is not a valid xml name.")
        self.__set_new_key(self.namespace, name)

    @property
    def namespace(self) -> str:
        """The attribute's namespace"""
        return self.__qualified_name[0]

    @namespace.setter
    def namespace(self, namespace: str):
        # TODO see https://github.com/delb-xml/delb-py/issues/69
        if namespace and not _is_xml_char(namespace):
            raise ValueError("Invalid XML character data.")
        self.__set_new_key(namespace, self.local_name)

    @property
    def universal_name(self) -> str:
        """
        The attribute's namespace and local name in `Clark notation`_.

        .. _Clark notation: http://www.jclark.com/xml/xmlns.htm
        """
        if namespace := self.namespace:
            return f"{{{namespace}}}{self.local_name}"
        else:
            return self.local_name

    @property
    def value(self) -> str:
        """The attribute's value."""
        return self.__value

    @value.setter
    def value(self, value: str):
        if not isinstance(value, str):
            raise TypeError
        if value and not _is_xml_char(value):
            raise ValueError("Invalid XML character data.")
        self.__value = value
class TagAttributes(MutableMapping): """ A data type to access a tag node's attributes. """ __slots__ = ( "__data", "__node", ) def __init__( self, data: _AttributesData | dict[AttributeAccessor, str] | TagAttributes, node: TagNodeType, ): if not isinstance(data, Mapping): raise TypeError self.__data: dict[QualifiedName, Attribute] = {} self.__node = node self.update(data) def __contains__(self, item: Any) -> bool: return self.__resolve_accessor(item) in self.__data def __delitem__(self, item: AttributeAccessor): name = self.__resolve_accessor(item) self.__data[name]._attributes = None del self.__data[name] def __eq__(self, other: Any) -> bool: if not isinstance(other, Mapping): return False if len(self) != len(other): return False if isinstance(other, TagAttributes): return self.__data == other.__data return self.__data == {self.__resolve_accessor(k): v for k, v in other.items()} def __getitem__(self, item: AttributeAccessor) -> Attribute: return self.__data[self.__resolve_accessor(item)] def __iter__(self) -> Iterator[QualifiedName]: return iter(self.__data) def __len__(self) -> int: return len(self.__data) def __setitem__(self, item: AttributeAccessor, value: str | Attribute): name = self.__resolve_accessor(item) match value: case Attribute(): if value._attributes is None: attribute = value else: attribute = Attribute(name, value.value) case str(): attribute = Attribute(name, value) case _: raise TypeError assert attribute._attributes in (self, None) attribute._attributes = self self.__data[name] = attribute def __str__(self): return str(self.as_dict_with_strings()) __repr__ = __str__ def __resolve_accessor(self, item: AttributeAccessor) -> QualifiedName: match item: case tuple(): assert item[0] is not None return item case str(): namespace, name = deconstruct_clark_notation(item) if namespace is None: return (self.__node.namespace, name) else: return (namespace, name) case _: raise TypeError(ATTRIBUTE_ACCESSOR_MSG) def as_dict_with_strings(self) -> dict[str, 
str]: """Returns the attributes as :class:`str` instances in a :class:`dict`.""" return {a.universal_name: a.value for a in self.values()} # containers class Siblings: """ Container for the sisterhood of nodes. Everyone's taken care of. """ __slots__ = ( "__belongs_to", "__data", ) def __init__( self, belongs_to: None | _ParentNode, nodes: Optional[Iterable[NodeSource]], ): self.__data: Final[list[XMLNodeType]] = [] self.__belongs_to: Final = belongs_to if nodes is not None: for node in nodes: self.__data.append(self._handle_new_sibling(node)) @overload def __getitem__(self, index: int) -> XMLNodeType: pass @overload def __getitem__(self, index: slice) -> list[XMLNodeType]: pass def __getitem__(self, index: int | slice) -> XMLNodeType | list[XMLNodeType]: if not isinstance(index, (int, slice)): raise TypeError return self.__data[index] def __iter__(self) -> Iterator[XMLNodeType]: return iter(self.__data) def __len__(self) -> int: return len(self.__data) def append(self, node: NodeSource) -> XMLNodeType: result = self._handle_new_sibling(node) self.__data.append(result) return result def clear(self): for node in self.__data: node._parent = None self.__data.clear() def index(self, node: XMLNodeType) -> int: for result, n in enumerate(self.__data): if n is node: return result else: raise IndexError def insert(self, index: int, node: NodeSource) -> XMLNodeType: result = self._handle_new_sibling(node) self.__data.insert(index, result) return result def remove(self, node: XMLNodeType): node._parent = None del self.__data[self.index(node)] def _handle_new_sibling(self, node: NodeSource) -> XMLNodeType: if isinstance(self.__belongs_to, _DocumentNode): if isinstance(node, (str, _TagDefinition)): raise TypeError if isinstance(node, TagNode) and any( isinstance(n, TagNode) for n in self.__data ): raise InvalidCodePath match node: case str(): node = TextNode(node) case _TagDefinition(): assert isinstance(self.__belongs_to, TagNode) node = 
self.__belongs_to._new_tag_node_from_definition(node) case XMLNodeType(): if node._parent is not None: raise InvalidOperation( "Only a detached node can be added to the tree. Use " ":meth:`XMLNodeType.clone` or :meth:`XMLNodeType.detach` to " "get one." ) case _: raise TypeError( "Either node instances, strings or objects from :func:`delb.tag` " "must be provided as child node." ) node._parent = self.__belongs_to return node # nodes class _NodeCommons(XMLNodeType): __slots__ = ("_parent",) def __init__(self): self._parent = None def __copy__(self): return self.clone(deep=False) def __deepcopy__(self, memo): return self.clone(deep=True) def __str__(self) -> str: return self.serialize( format_options=DefaultStringOptions.format_options, namespaces=DefaultStringOptions.namespaces, newline=DefaultStringOptions.newline, ) def add_following_siblings( self, *node: NodeSource, clone: bool = False ) -> tuple[XMLNodeType, ...]: if self._parent is None: raise InvalidOperation("Can't add sibling to a node without parent node.") return tuple( reversed( self._parent.insert_children( self._parent._child_nodes.index(self) + 1, *node, clone=clone ) ) ) def add_preceding_siblings( self, *node: NodeSource, clone: bool = False ) -> tuple[XMLNodeType, ...]: if self._parent is None: raise InvalidOperation("Can't add sibling to a node without parent node.") return self._parent.insert_children( self._parent._child_nodes.index(self), *reversed(node), clone=clone ) @property def depth(self) -> int: result = 0 pointer: XMLNodeType | None = self assert pointer is not None while True: pointer = pointer._parent if pointer is None or isinstance(pointer, _DocumentNode): break result += 1 return result def detach(self, retain_child_nodes: bool = False) -> Self: if (parent := self._parent) is not None: parent._child_nodes.remove(self) return self def fetch_following(self, *filter: Filter) -> Optional[XMLNodeType]: all_filters = default_filters[-1] + filter for node in self._iterate_following(): if 
all(f(node) for f in all_filters): return node else: return None def _fetch_following(self) -> Optional[XMLNodeType]: for node in self._iterate_following(): return node else: return None def fetch_following_sibling(self, *filter: Filter) -> Optional[XMLNodeType]: all_filters = default_filters[-1] + filter for node in self._iterate_following_siblings(): if all(f(node) for f in all_filters): return node else: return None def _fetch_following_sibling(self) -> Optional[XMLNodeType]: if self._parent is None: return None if (siblings := self._parent._child_nodes)[-1] is self: return None return siblings[siblings.index(self) + 1] def fetch_preceding(self, *filter: Filter) -> Optional[XMLNodeType]: all_filters = default_filters[-1] + filter for node in self._iterate_preceding(): if all(f(node) for f in all_filters): return node else: return None def _fetch_preceding(self) -> Optional[XMLNodeType]: for node in self._iterate_preceding(): return node else: return None def fetch_preceding_sibling(self, *filter: Filter) -> Optional[XMLNodeType]: all_filters = default_filters[-1] + filter for node in self._iterate_preceding_siblings(): if all(f(node) for f in all_filters): return node else: return None def _fetch_preceding_sibling(self) -> Optional[XMLNodeType]: if self._parent is None: return None if (siblings := self._parent._child_nodes)[0] is self: return None return siblings[siblings.index(self) - 1] @property def index(self) -> Optional[int]: if self._parent is not None: for result, node in enumerate( n for n in self._parent._child_nodes if all(f(n) for f in default_filters[-1]) ): if node is self: return result return None def iterate_ancestors(self, *filter: Filter) -> Iterator[ParentNodeType]: all_filters = default_filters[-1] + filter for node in self._iterate_ancestors(): if all(f(node) for f in all_filters): yield node def _iterate_ancestors( self, *, _include_document_node: bool = False ) -> Iterator[ParentNodeType]: node: None | XMLNodeType = self assert node is 
not None if _include_document_node: while (node := node._parent) is not None: yield node else: while True: node = node._parent if node is None or isinstance(node, _DocumentNode): return yield node def iterate_following( self, *filter: Filter, include_descendants: bool = True ) -> Iterator[XMLNodeType]: all_filters = default_filters[-1] + filter for node in self._iterate_following(include_descendants=include_descendants): if all(f(node) for f in all_filters): yield node def _iterate_following( self, *, include_descendants: bool = True ) -> Iterator[XMLNodeType]: if include_descendants: yield from self._iterate_descendants() if self._parent is None: return for following_sibling in self._iterate_following_siblings(): yield following_sibling yield from following_sibling._iterate_descendants() for ancestor in self._iterate_ancestors(): if ( ancestors_following_sibling := ancestor._fetch_following_sibling() ) is not None: break else: return yield ancestors_following_sibling yield from ancestors_following_sibling._iterate_following( include_descendants=True ) def iterate_following_siblings(self, *filter: Filter) -> Iterator[XMLNodeType]: all_filters = default_filters[-1] + filter for node in self._iterate_following_siblings(): if all(f(node) for f in all_filters): yield node def _iterate_following_siblings(self) -> Iterator[XMLNodeType]: if self._parent is None: return siblings = self._parent._child_nodes for index in range(siblings.index(self) + 1, len(siblings)): yield siblings[index] def iterate_preceding( self, *filter: Filter, include_ancestors: bool = True ) -> Iterator[XMLNodeType]: all_filters = default_filters[-1] + filter for node in self._iterate_preceding(include_ancestors=include_ancestors): if all(f(node) for f in all_filters): yield node def _iterate_preceding( self, *, include_ancestors: bool = True ) -> Iterator[XMLNodeType]: if (parent := self._parent) is None: return for preceding_sibling in self._iterate_preceding_siblings(): yield from 
preceding_sibling._iterate_reversed_descendants() if isinstance(parent, _DocumentNode): return if include_ancestors: yield parent yield from parent._iterate_preceding(include_ancestors=include_ancestors) def iterate_preceding_siblings(self, *filter: Filter) -> Iterator[XMLNodeType]: all_filters = default_filters[-1] + filter for node in self._iterate_preceding_siblings(): if all(f(node) for f in all_filters): yield node def _iterate_preceding_siblings(self) -> Iterator[XMLNodeType]: if self._parent is None: return siblings = self._parent._child_nodes for index in range(siblings.index(self) - 1, -1, -1): yield siblings[index] def _iterate_reversed_descendants(self) -> Iterator[XMLNodeType]: if not isinstance(self, TagNode) or not self._child_nodes: yield self return stack = [(self, list(self._child_nodes))] while stack: parent, children = stack[-1] if children: node = children.pop() if isinstance(node, TagNode) and node._child_nodes: stack.append((node, list(node._child_nodes))) else: yield node else: stack.pop() yield parent @property def parent(self) -> Optional[ParentNodeType]: return None if isinstance(self._parent, _DocumentNode) else self._parent def replace_with(self, node: NodeSource, clone: bool = False) -> Self: if (parent := self._parent) is None: raise InvalidOperation( "Cannot replace a root node of a tree. Maybe you want to set the " "`root` property of a Document instance?" 
) if clone and isinstance(node, _NodeCommons): node = node.clone(deep=True) parent._child_nodes.insert(parent._child_nodes.index(self), node) return self.detach(retain_child_nodes=False) def serialize( self, *, format_options: Optional[FormatOptions] = None, namespaces: Optional[NamespaceDeclarations] = None, newline: Optional[str] = None, ) -> str: serializer = _get_serializer( _StringWriter(newline=newline), format_options=format_options, namespaces=namespaces, ) serializer.serialize_node(self) return serializer.writer.result def xpath( self, expression: str, namespaces: Optional[NamespaceDeclarations] = None, ) -> QueryResults: return evaluate_xpath(node=self, expression=expression, namespaces=namespaces) class _LeafNode(_NodeCommons): """Node types using this mixin also can't be root nodes of a document.""" __slots__ = () first_child = None """ The node's first child. """ last_child = None """ The node's last child node. """ last_descendant = None """ The node's last descendant. """ def __len__(self): return 0 @property def document(self) -> Optional[Document]: if self._parent is None: return None else: return self._parent.document @property def full_text(self) -> str: return "" # the following yield statements are there to trick mypy def iterate_children(self, *filter: Filter) -> Iterator[XMLNodeType]: """ A :term:`generator iterator` that yields nothing. :meta category: Methods to iterate over related node """ return yield from () def iterate_descendants(self, *filter: Filter) -> Iterator[XMLNodeType]: """ A :term:`generator iterator` that yields nothing. 
:meta category: Methods to iterate over related node """ return yield from () def _iterate_descendants(self) -> Iterator[XMLNodeType]: return yield from () class _ParentNode(_NodeCommons, ParentNodeType): __slots__ = ("_child_nodes",) def __init__( self, children: Iterable[NodeSource] = (), ): super().__init__() self._child_nodes = Siblings(nodes=children, belongs_to=self) def __len__(self) -> int: result = 0 for node in self._child_nodes: if all(f(node) for f in default_filters[-1]): result += 1 return result def append_children( self, *node: NodeSource, clone: bool = False ) -> tuple[XMLNodeType, ...]: if not node: return () result: list[XMLNodeType] = [] for _node in node: if clone and isinstance(_node, _NodeCommons): _node = _node.clone(deep=True) result.append(self._child_nodes.append(_node)) return tuple(result) @property def first_child(self) -> Optional[XMLNodeType]: for node in self._child_nodes: if all(f(node) for f in default_filters[-1]): return node else: return None @property def full_text(self) -> str: return "".join( n.content for n in self._iterate_descendants() if isinstance(n, TextNode) ) def insert_children( self, index: int, *node: NodeSource, clone: bool = False ) -> tuple[XMLNodeType, ...]: children_size = len(self._child_nodes) if not (children_size * -1 <= index <= children_size): raise IndexError result = [] for _node in reversed(node): if clone and isinstance(_node, _NodeCommons): _node = _node.clone(deep=True) result.append(self._child_nodes.insert(index, _node)) return tuple(result) def iterate_children(self, *filter: Filter) -> Iterator[XMLNodeType]: all_filters = default_filters[-1] + filter for node in self._child_nodes: if all(f(node) for f in all_filters): yield node def iterate_descendants(self, *filter: Filter) -> Iterator[XMLNodeType]: if not self._child_nodes: return all_filters = default_filters[-1] + filter for node in self._iterate_descendants(): if all(f(node) for f in all_filters): yield node def _iterate_descendants(self) 
-> Iterator[XMLNodeType]: stack = [(self._child_nodes, 0)] while stack: siblings, pointer = stack.pop() for node in siblings[pointer:]: pointer += 1 yield node if isinstance(node, TagNode) and node._child_nodes: stack.extend(((siblings, pointer), (node._child_nodes, 0))) break @property def last_child(self) -> Optional[XMLNodeType]: if self._child_nodes: filters = default_filters[-1] for node in self._child_nodes[::-1]: if all(f(node) for f in filters): return node return None @property def last_descendant(self) -> Optional[XMLNodeType]: for node in self._iterate_reversed_descendants(): if node is not self and all(f(node) for f in default_filters[-1]): return node else: return None def merge_text_nodes(self, deep: bool = False): empty_nodes: list[TextNodeType] = [] for index in range(len(self._child_nodes) - 1, -1, -1): node = self._child_nodes[index] if isinstance(node, TextNode): if not node.content: empty_nodes.append(node) elif index and isinstance( (preceding_node := self._child_nodes[index - 1]), TextNode ): preceding_node.content += node.content empty_nodes.append(node) for node in empty_nodes: node.content = "" node.detach() if deep: for node in (n for n in self._child_nodes if isinstance(n, TagNode)): node.merge_text_nodes(deep=True) def prepend_children( self, *node: XMLNodeType, clone: bool = False ) -> tuple[XMLNodeType, ...]: return self.insert_children(0, *node, clone=clone)
class CommentNode(_LeafNode, CommentNodeType):
    """
    Comment nodes of a tree are represented by instances of this class.

    This class implements :class:`delb.typing.CommentNodeType`.

    :param content: The comment's content a.k.a. text.
    """

    __slots__ = ("__content",)

    def __init__(self, content: str):
        super().__init__()
        # assigned via the property in order to validate the value
        self.content = content

    def __eq__(self, other) -> bool:
        if not isinstance(other, CommentNode):
            return False
        return self.content == other.content

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f'<{class_name}("{self.content}") [{hex(id(self))}]>'

    def __str__(self) -> str:
        return f"<!--{self.content}-->"

    def clone(self, deep: bool = False) -> CommentNode:
        """Returns a new comment node with the same content."""
        return CommentNode(self.__content)

    @property
    def content(self) -> str:
        """The comment's text."""
        return self.__content

    @content.setter
    def content(self, value: str):
        if value and not _is_xml_char(value):
            raise ValueError("Invalid XML character data.")
        # neither a double hyphen nor a trailing hyphen is accepted
        if value.endswith("-") or "--" in value:
            raise ValueError("Invalid Comment content.")
        self.__content = value
class _DocumentNode(_ParentNode, _DocumentNodeType):
    """
    A node type whose sole purpose is to allow tree traversal beyond a root
    node by means of its :attr:`_DocumentNode._child_nodes` attribute. Hence it
    shall only be reachable through :attr:`XMLNodeType._parent` and be yielded
    by :meth:`XMLNodeType._iterate_ancestors` when that is explicitly requested
    with an argument. It also refers to the :class:`Document` instance that a
    tree relates to.

    In the context of XPath evaluations it acts like
    :class:`xpath.ast._DocumentNode` that is used as shim for queries that
    target trees that are not associated to a :class:`Document` instance.
    """

    __slots__ = ("__document",)

    def __init__(self, document: Document | None, children: Iterable[XMLNodeType]):
        super().__init__(children)
        self.__document: Final = document

    # properties

    @property
    def document(self) -> Document:
        document = self.__document
        assert document is not None
        return document

    @property
    def _parent(self) -> None:
        # a document node never has a parent
        return None

    @_parent.setter
    def _parent(self, value):  # pragma: no cover
        if value is not None:
            raise InvalidCodePath

    # operations that must never be invoked on a document node

    def add_following_siblings(  # pragma: no cover
        self, *node: NodeSource, clone: bool = False
    ) -> tuple[XMLNodeType, ...]:
        raise InvalidCodePath

    def add_preceding_siblings(  # pragma: no cover
        self, *node: NodeSource, clone: bool = False
    ) -> tuple[XMLNodeType, ...]:
        raise InvalidCodePath

    def clone(self, deep: bool = False) -> XMLNodeType:  # pragma: no cover
        raise InvalidCodePath

    def detach(self, retain_child_nodes: bool = False) -> Self:  # pragma: no cover
        raise InvalidCodePath

    def replace_with(  # pragma: no cover
        self, node: NodeSource, clone: bool = False
    ) -> Self:
        raise InvalidCodePath
class ProcessingInstructionNode(_LeafNode, ProcessingInstructionNodeType):
    """
    Processing instruction nodes of a tree are represented by instances of this
    class.

    This class implements :class:`delb.typing.ProcessingInstructionNodeType`.

    :param target: The processing instruction's target name.
    :param content: The processing instruction's text.
    """

    __slots__ = ("__content", "__target")

    def __init__(self, target: str, content: str):
        super().__init__()
        # both are assigned via their properties in order to validate them
        self.content = content
        self.target = target

    def __eq__(self, other) -> bool:
        if not isinstance(other, ProcessingInstructionNode):
            return False
        return self.target == other.target and self.content == other.content

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f'<{class_name}("{self.target}", "{self.content}") [{hex(id(self))}]>'

    def __str__(self) -> str:
        return f"<?{self.target} {self.content}?>"

    def clone(self, deep: bool = False) -> ProcessingInstructionNode:
        """Returns a new processing instruction node with the same properties."""
        return ProcessingInstructionNode(self.__target, self.__content)

    @property
    def content(self) -> str:
        """The processing instruction's text."""
        return self.__content

    @content.setter
    def content(self, value: str):
        if value and not _is_xml_char(value):
            raise ValueError("Invalid XML character data.")
        if "?>" in value:
            raise ValueError("Content text must not contain '?>'.")
        self.__content = value

    @property
    def target(self) -> str:
        """
        The processing instruction's target.

        :meta category: Node content properties
        """
        return self.__target

    @target.setter
    def target(self, value: str):
        if not _is_xml_name(value):
            raise ValueError("Invalid target name.")
        # the name "xml" is rejected in any letter case
        if value.lower() == "xml":
            raise ValueError(f"{value} is a reserved target name.")
        self.__target = value
[docs] class TagNode(_ParentNode, TagNodeType): """ The instances of this class represent tag nodes of a tree, the equivalent of DOM's elements. This class implements :class:`delb.typing.TagNodeType`. :param local_name: The tag name. :param attributes: Optional attributes that are assigned to the new node. :param namespace: An optional tag namespace. :param children: An optional iterable of objects that will be appended as child nodes. This can be existing nodes, strings that will be inserted as text nodes and in-place definitions of :class:`TagNode` instances from :func:`tag`. The latter will be assigned to the same namespace. Some syntactic sugar is baked in: Attributes and nodes can be tested for membership in a node. >>> root = Document('<root ham="spam"><child/></root>').root >>> "ham" in root True >>> root.first_child in root True Nodes can be copied. Note that this relies on :meth:`TagNode.clone`. >>> from copy import copy, deepcopy >>> root = Document("<root>Content</root>").root >>> print(copy(root)) <root/> >>> print(deepcopy(root)) <root>Content</root> Attribute values and child nodes can be obtained, set and deleted with the subscript notation. >>> root = Document('<root x="y"><child_1/>child_2<child_3/></root>').root >>> print(root["x"]) y >>> print(root[0]) <child_1/> >>> print(root[-1]) <child_3/> >>> print([str(x) for x in root[1::-1]]) ['child_2', '<child_1/>'] How much child nodes has this node anyway? >>> root = Document("<root><child_1/><child_2/></root>").root >>> len(root) 2 >>> len(root[0]) 0 As seen in the examples above, a tag nodes string representation yields a serialized XML representation of a sub-/tree. See :doc:`/api/serialization` for details. 
""" __slots__ = ( "__attributes", "__local_name", "__namespace", ) def __init__( self, local_name: str, attributes: Optional[ _AttributesData | dict[AttributeAccessor, str] | TagAttributes ] = None, namespace: Optional[str] = None, children: Iterable[NodeSource] = (), ): self.namespace = namespace or "" self.local_name = local_name self.__attributes = TagAttributes(data=attributes or {}, node=self) super().__init__(children) def __contains__(self, item: AttributeAccessor | XMLNodeType) -> bool: match item: case str() | tuple(): return item in self.attributes case XMLNodeType(): return item in self._child_nodes case _: raise TypeError( "Argument must be a node instance or an attribute name. " + ATTRIBUTE_ACCESSOR_MSG ) def __delitem__(self, item: AttributeAccessor | int): match item: case str() | tuple(): del self.attributes[item] case int(): self[item].detach(retain_child_nodes=False) case slice(): if all( isinstance(x, int) or x is None for x in (item.start, item.stop) ): for node in self[item]: node.detach(retain_child_nodes=False) else: del self.attributes[(item.start, item.stop)] case _: raise TypeError( # TODO or a slice "Argument must be an integer or an attribute name. " + ATTRIBUTE_ACCESSOR_MSG ) @overload def __getitem__(self, item: int) -> XMLNodeType: ... @overload def __getitem__(self, item: AttributeAccessor) -> Attribute | None: ... def __getitem__(self, item): match item: case str() | tuple(): return self.attributes[item] case int(): if item < 0: item = len(self) + item for index, child_node in enumerate(self.iterate_children()): if index == item: return child_node raise IndexError("Node index out of range.") case slice() if all( (isinstance(x, int) or x is None) for x in (item.start, item.stop) ): return list(self.iterate_children())[item] raise TypeError( "Argument must be an integer as index for a child node, a " ":term:`slice` to grab an indexed range of nodes or an attribute " "name. 
" + ATTRIBUTE_ACCESSOR_MSG ) def __repr__(self) -> str: return ( f'<{self.__class__.__name__}("{self.universal_name}", ' f"{self.attributes}, {self.location_path}) [{hex(id(self))}]>" ) @overload def __setitem__(self, item: int, value: NodeSource): ... @overload def __setitem__(self, item: AttributeAccessor, value: str | Attribute): ... def __setitem__(self, item, value): match item: case str() | tuple(): self.attributes[item] = value case int(): children_size = len(self._child_nodes) if children_size == item: self._child_nodes.append(value) elif 0 <= item < children_size or ( item < 0 and abs(item) <= children_size ): self[item].replace_with(value) else: raise IndexError case _: raise TypeError( "Argument must be an integer or an attribute name. " + ATTRIBUTE_ACCESSOR_MSG )
[docs] def add_following_siblings( self, *node: NodeSource, clone: bool = False ) -> tuple[XMLNodeType, ...]: if self._parent is None: raise InvalidOperation("Can't add sibling to a node without parent node.") return super().add_following_siblings(*node)
[docs] def add_preceding_siblings( self, *node: NodeSource, clone: bool = False ) -> tuple[XMLNodeType, ...]: if self._parent is None: raise InvalidOperation("Can't add sibling to a node without parent node.") return super().add_preceding_siblings(*node)
@property def attributes(self) -> TagAttributes: """ A :term:`mapping` that can be used to access the node's attributes. :meta category: Node content properties >>> node = TagNode("node", attributes={"foo": "0", "bar": "0"}) >>> node.attributes {'foo': '0', 'bar': '0'} >>> node.attributes.pop("bar") # doctest: +ELLIPSIS <Attribute(bar="0") [0x...]> >>> node.attributes["foo"] = "1" >>> node.attributes["peng"] = "1" >>> print(node) <node foo="1" peng="1"/> >>> node.attributes.update({"foo": "2", "zong": "2"}) >>> print(node) <node foo="2" peng="1" zong="2"/> Namespaced names are accessed with two-value tuples or a string. The two-value holds the namespace and the local name in that order. A string can either be a fully qualified name in `Clark notation`_ or a local name that belongs to the containing node's namespace. .. _Clark notation: http://www.jclark.com/xml/xmlns.htm >>> DefaultStringOptions.namespaces = {"": "http://namespace"} >>> node = TagNode( ... "node", ... namespace="http://namespace", ... ) >>> node.attributes.update({("http://namespace", "foo"): "0"}) >>> print(node) <node xmlns="http://namespace" foo="0"/> >>> attribute = node.attributes[("http://namespace", "foo")] >>> node.attributes["foo"] is attribute True >>> node.attributes["{http://namespace}foo"] is attribute True Attributes behave like strings, but also expose namespace, local name and value for manipulation. >>> node = TagNode("node") >>> node.attributes["foo"] = "0" >>> node.attributes["foo"].local_name = "bar" >>> node.attributes["bar"].namespace = "http://namespace" >>> node.attributes[("http://namespace", "bar")].value = "X" >>> print(node) <node xmlns:ns0="http://namespace" ns0:bar="X"/> >>> "ref-" + node.attributes[("http://namespace", "bar")].lower() 'ref-x' """ return self.__attributes
[docs] def clone(self, deep: bool = False) -> TagNodeType: result = TagNode( local_name=self.__local_name, namespace=self.__namespace, attributes=self.attributes, ) if deep: result.append_children(*(n.clone(deep=True) for n in self._child_nodes)) return result
[docs] def css_select( self, expression: str, namespaces: Optional[NamespaceDeclarations] = None ) -> QueryResults: """ Queries the tree with a CSS selector expression with this node as initial context node. :param expression: A CSS selector expression. :param namespaces: A mapping of prefixes that are used in the expression to namespaces. If not provided the node's namespace will serve as default, mapped to an empty prefix. :return: All nodes that match the evaluation of the provided CSS selector expression. :meta category: Methods to query the tree See :doc:`/api/querying` regarding the extent of the supported grammar. Namespace prefixes are delimited with a ``|`` before a name test, for example ``div svg|metadata`` selects all descendants of ``div`` named nodes that belong to the default namespace or have no namespace and whose name is ``metadata`` and have a namespace that is mapped to the ``svg`` prefix. """ return self.xpath(expression=_css_to_xpath(expression), namespaces=namespaces)
[docs] def detach(self, retain_child_nodes: bool = False) -> Self: if isinstance(self._parent, _DocumentNode): raise InvalidOperation("The root node of a document cannot be detached.") if self._parent is None: if retain_child_nodes: raise InvalidOperation( "Child nodes can't be retained when the node to detach has no " "parent node." ) return self index = self._parent._child_nodes.index(self) if retain_child_nodes: children = tuple(self._child_nodes) self._child_nodes.clear() self._parent.insert_children(index, *children) self._parent._child_nodes.remove(self) return self
@property def document(self) -> Optional[Document]: document_node = last(self._iterate_ancestors(_include_document_node=True)) if isinstance(document_node, _DocumentNode): return document_node.document else: return None
[docs] def fetch_or_create_by_xpath( self, expression: str, namespaces: Optional[NamespaceDeclarations] = None, ) -> TagNodeType: ast = parse_xpath(expression) if not ast._is_unambiguously_locatable: raise ValueError( "The XPath expression doesn't determine a distinct branch." ) query_result = self.xpath(expression, namespaces=namespaces) if query_result.size == 1: result = query_result.first assert isinstance(result, TagNode) return result if query_result: raise AmbiguousTreeError( f"The tree already contains {query_result.size} matching branches." ) return self._create_by_xpath( ast=ast, namespaces=Namespaces(namespaces or Namespaces({"": self.namespace})), )
def _create_by_xpath( self, ast: XPathExpression, namespaces: Namespaces, ) -> TagNode: node: _ParentNode if ast.location_paths[0].absolute: match root := last(self._iterate_ancestors(_include_document_node=True)): case _DocumentNode(): node = root case TagNode(): node = _DocumentNode(None, (root,)) case None: node = _DocumentNode(None, (self,)) else: node = self for i, step in enumerate(ast.location_paths[0].location_steps): candidates = tuple(step.evaluate(node_set=(node,), namespaces=namespaces)) match len(candidates): case 0: node_test = step.node_test assert isinstance(node, TagNode) assert isinstance(node_test, NameMatchTest) new_node = TagNode( local_name=node_test.local_name, attributes=None, namespace=namespaces.get(node_test.prefix), ) for prefix, local_name, value in step._derived_attributes: new_node.attributes[ (namespaces.get(prefix) or "", local_name) ] = value node.append_children(new_node) node = new_node case 1: node = cast("TagNode", candidates[0]) case _: raise AmbiguousTreeError( f"The tree has multiple possible branches at location step {i}." 
) assert isinstance(node, TagNode) return node def _get_normalize_space_directive( self, default: Literal["default", "preserve"] = "default" ) -> Literal["default", "preserve"]: if (attribute := self.attributes.get((XML_NAMESPACE, "space"))) is None: return default if attribute in ("default", "preserve"): return attribute warnings.warn( "Encountered and ignoring an invalid `xml:space` attribute: " + attribute.value, category=UserWarning, ) return default @property def id(self) -> Optional[str]: return self.attributes.get((XML_NAMESPACE, "id")) @id.setter def id(self, value: Optional[str]): match value: case None: del self.attributes[(XML_NAMESPACE, "id")] case str(): if not _is_xml_name(value): raise ValueError("Value is not a valid xml name.") root = cast("TagNode", last(self._iterate_ancestors())) or self for node in chain((root,), root._iterate_descendants()): if not isinstance(node, TagNode): continue if node.attributes.get((XML_NAMESPACE, "id"), "") == value: raise ValueError( "An xml:id-attribute with that value is already assigned " "in the tree." ) self.attributes[(XML_NAMESPACE, "id")] = value case _: raise TypeError("Value must be None or a string.") @property def local_name(self) -> str: return self.__local_name @local_name.setter def local_name(self, value: str): if not _is_xml_name(value): raise ValueError("Value is not a valid xml name.") self.__local_name = value @property def location_path(self) -> str: if not isinstance(self._parent, TagNode): return "/*" steps: list[XMLNodeType] = list(self._iterate_ancestors()) steps.pop() # root steps.reverse() steps.append(self) with altered_default_filters(is_tag_node): # to affect the .index value return "/*" + "".join(f"/*[{cast('int', n.index)+1}]" for n in steps) @property def namespace(self) -> str: """ The node's namespace. An empty string represents an empty namespace. 
:meta category: Node properties """ return self.__namespace @namespace.setter def namespace(self, value: str): # TODO see https://github.com/delb-xml/delb-py/issues/69 if value and not _is_xml_char(value): raise ValueError("Invalid XML character data.") self.__namespace = value def _new_tag_node_from_definition(self, definition: _TagDefinition) -> TagNode: return TagNode( local_name=definition.local_name, attributes=definition.attributes, namespace=self.namespace, children=definition.children, )
[docs] @staticmethod def parse(text, parser_options): # pragma: no cover # REMOVE with version 0.7 """This method has been replaced by :func:`delb.parse_tree`.""" raise InvalidOperation( "This method has been replaced by `delb.parse_tree`.", )
def _reduce_whitespace( self, normalize_space: Literal["default", "preserve"] = "default" ): self._reduce_whitespace_of_descendants(normalize_space) def _reduce_whitespace_of_descendants( self, normalize_space: Literal["default", "preserve"] ): if not (child_nodes := self._child_nodes): return self.merge_text_nodes(deep=False) if ( normalize_space := self._get_normalize_space_directive(normalize_space) ) == "default": assert isinstance(child_nodes, Siblings) _reduce_whitespace_between_siblings(child_nodes) for child_node in (n for n in child_nodes if isinstance(n, TagNode)): child_node._reduce_whitespace_of_descendants(normalize_space)
[docs] def serialize( self, *, format_options: Optional[FormatOptions] = None, namespaces: Optional[NamespaceDeclarations] = None, newline: Optional[str] = None, ) -> str: serializer = _get_serializer( _StringWriter(newline=newline), format_options=format_options, namespaces=namespaces, ) serializer.serialize_root(self) return serializer.writer.result
@property def universal_name(self) -> str: return "{" + self.__namespace + "}" + self.__local_name
[docs] class TextNode(_LeafNode, _StringMixin, TextNodeType): # type: ignore """ TextNodes contain the textual data of a document. The class shall not be initialized by client code, just throw strings into the trees. This class implements :class:`delb.typing.TextNodeType`. Instances expose all methods of :class:`str` except :meth:`str.index`: >>> node = TextNode("Show us the way to the next whisky bar.") >>> node.split() ['Show', 'us', 'the', 'way', 'to', 'the', 'next', 'whisky', 'bar.'] Instances can be tested for inequality with other text nodes and strings: >>> TextNode("ham") == TextNode("spam") False >>> TextNode("Patsy") == "Patsy" True And they can be tested for substrings: >>> "Sir" in TextNode("Sir Bedevere the Wise") True Attributes that rely to child nodes yield nothing respectively :obj:`None`. """ __slots__ = ("__content",) def __init__( self, text: str | TextNode, ): super().__init__() match text: case str(): self.content = text case TextNode(): self.content = text.__content case _: raise TypeError def __eq__(self, other): if isinstance(other, TextNode): return self.__content == other.content else: return super().__eq__(other) def __getitem__(self, item): return self.content[item] def __len__(self): return len(self.__content) def __repr__(self): return f'<{self.__class__.__name__}(text="{self.content}", [{hex(id(self))}]>' def __str__(self): return self.__content
[docs] def clone(self, deep: bool = False) -> TextNodeType: return TextNode(self.__content)
@property def content(self) -> str: return self.__content @content.setter def content(self, text: str): if not isinstance(text, str): raise TypeError self.__content = text @property def full_text(self) -> str: return self.__content
# __all__ = ( Attribute.__name__, CommentNode.__name__, ProcessingInstructionNode.__name__, QueryResults.__name__, Siblings.__name__, TagAttributes.__name__, TagNode.__name__, TextNode.__name__, )