Source code for _delb.xpath

# Copyright (C) 2018-'22  Frank Sachsenheim
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
*delb* allows querying of nodes with CSS selector and XPath expressions. CSS selectors
are converted to XPath expressions with a third-party library before evaluation and they
are only supported as far as their computed XPath equivalents are supported by *delb*'s
very own XPath implementation.

This implementation is not fully compliant with any of the W3C's XPath specifications.
It mostly covers the `XPath 1.0 specs`_, but focuses on querying via path
expressions with simple constraints while it omits a broad employment of computations
(that's what programming languages are for) and therefore has these intended deviations
from that standard:

- Default namespaces can be addressed in node and attribute names, by simply using no
  prefix.
- The attribute and namespace axes are not supported in location steps (see also below).
- In predicates only the attribute axis can be used in its abbreviated form (``@name``).
- Path evaluations within predicates are not available.
- Only these predicate functions are provided and tested:
    - ``boolean``
    - ``concat``
    - ``contains``
    - ``last``
    - ``not``
    - ``position``
    - ``starts-with``
    - ``text``
        - Behaves as if deployed as a single step location path that only tests for the
          node type *text*. Hence it returns the contents of the context node's first
          child node that is a text node or an empty string when there is none.
    - Please refrain from extension requests without a proper, concrete implementation
      proposal.

If you're accustomed to retrieving attribute values with XPath expressions, employ the
functionality of the higher programming language at hand like this:

    >>> [x.attributes["target"] for x in root.xpath(".//foo")
    ...  if "target" in x.attributes ]  # doctest: +SKIP

Instead of:

    >>> root.xpath(".//foo/@target")  # doctest: +SKIP

See :meth:`_delb.plugins.PluginManager.register_xpath_function` regarding the use of
custom functions.

.. _XPath 1.0 specs: https://www.w3.org/TR/1999/REC-xpath-19991116/
"""

from __future__ import annotations

from functools import lru_cache
from typing import TYPE_CHECKING, Optional

# DROPWITH Python 3.8 and replace w/ imports from collections.abc
from typing import Collection, Iterable, Iterator, Mapping, Sequence

from cssselect import GenericTranslator

from _delb.names import Namespaces
from _delb.utils import sort_nodes_in_document_order
from _delb.xpath.ast import EvaluationContext
from _delb.xpath import functions  # noqa: F401
from _delb.xpath.parser import parse


if TYPE_CHECKING:
    from _delb.nodes import NodeBase
    from _delb.typing import Filter, NamespaceDeclarations

# module-wide translator instance that _css_to_xpath uses to convert CSS selectors
_css_translator = GenericTranslator()


class QueryResults(Sequence["NodeBase"]):
    """
    A container that includes the results of a CSS selector or XPath query with some
    helpers for better readable Python expressions.
    """

    def __init__(self, results: Iterable[NodeBase]):
        # the results are frozen into a tuple, the container itself offers no
        # mutating methods
        self.__items = tuple(results)

    def __eq__(self, other):
        """
        Compares the contained nodes with the members of another collection. The
        order of the items is irrelevant for the comparison.

        :raises TypeError: If ``other`` is not a collection.
        """
        if not isinstance(other, Collection):
            raise TypeError(
                f"A {self.__class__.__name__} can only be compared with collections, "
                f"not with {type(other).__name__} objects."
            )

        return len(self.__items) == len(other) and all(x in other for x in self.__items)

    def __getitem__(self, item):
        return self.__items[item]

    def __len__(self) -> int:
        return len(self.__items)

    def __repr__(self):
        return str([repr(x) for x in self.__items])

    def as_list(self) -> list[NodeBase]:
        """The contained nodes as a new :class:`list`."""
        return list(self.__items)

    @property
    def as_tuple(self) -> tuple[NodeBase, ...]:
        """The contained nodes in a :class:`tuple`."""
        return self.__items

    def filtered_by(self, *filters: Filter) -> QueryResults:
        """
        Returns another :class:`QueryResults` instance that contains all nodes filtered
        by the provided :term:`filter` s.
        """
        items: Sequence[NodeBase] = self.__items
        # the filters are applied one after another, a node must pass all of them
        for filter_ in filters:
            items = [x for x in items if filter_(x)]
        return self.__class__(items)

    @property
    def first(self) -> Optional[NodeBase]:
        """The first node from the results or :py:obj:`None` if there are none."""
        return self.__items[0] if self.__items else None

    def in_document_order(self) -> QueryResults:
        """
        Returns another :class:`QueryResults` instance where the contained nodes are
        sorted in document order.
        """
        return QueryResults(sort_nodes_in_document_order(self))

    @property
    def last(self) -> Optional[NodeBase]:
        """The last node from the results or :py:obj:`None` if there are none."""
        return self.__items[-1] if self.__items else None

    @property
    def size(self) -> int:
        """The amount of contained nodes."""
        return len(self.__items)


# TODO make cachesize configurable via environment variable?
@lru_cache(maxsize=64)
def _css_to_xpath(expression: str) -> str:
    """
    Translates a CSS selector into an XPath expression whose steps are prefixed with
    the ``descendant::`` axis. Up to 64 translations are kept in a cache.
    """
    xpath_expression = _css_translator.css_to_xpath(expression, prefix="descendant::")
    return xpath_expression


def evaluate(
    node: NodeBase,
    expression: str,
    namespaces: Optional[NamespaceDeclarations] = None,
) -> QueryResults:
    """
    Parses the given XPath expression, evaluates it with the given node as starting
    point and wraps the outcome in a :class:`QueryResults` instance.

    :param node: The node that the evaluation starts from.
    :param expression: The XPath expression to evaluate.
    :param namespaces: An optional mapping with namespace declarations. The node's
                       namespaces are used as fallback for these. Without it the
                       node's namespaces are used directly.
    :raises TypeError: If ``namespaces`` is a ``Namespaces`` instance or not a mapping
                       at all.
    """
    # global namespaces are guaranteed by the Namespaces implementation
    if namespaces is None:
        _namespaces = node.namespaces
    elif isinstance(namespaces, Namespaces):
        # b/c it would break fallback chains
        raise TypeError(
            "The `namespaces` argument must be a plain mapping, not a Namespaces "
            "instance."
        )
    elif isinstance(namespaces, Mapping):
        _namespaces = Namespaces(namespaces, fallback=node.namespaces)
    else:
        raise TypeError(
            "The `namespaces` argument must be a mapping or None, not a "
            f"{type(namespaces).__name__} object."
        )

    return QueryResults(parse(expression).evaluate(node=node, namespaces=_namespaces))


# The exported names are read from the objects themselves instead of being spelled
# out as string literals.
__all__ = (
    _css_to_xpath.__name__,  # type:ignore
    evaluate.__name__,
    parse.__name__,  # type: ignore
    EvaluationContext.__name__,
    QueryResults.__name__,
)


# REMOVE eventually

#  L E G A C Y  #


""" This was not an XPath implementation either. """


def _reduce_whitespace(expression: str) -> str:
    """
    Remove unnecessary whitespace from xpath predicate expression.

    >>> _reduce_whitespace('[@a = "1" or  @b = "2"][@c = "3"]')
    '[@a="1" or @b="2"][@c="3"]'

    >>> _reduce_whitespace('[contains(@a, "1")]')
    '[contains(@a,"1")]'

    """
    quote = ""
    result = ""
    skip = 0

    for i, character in enumerate(expression):
        if skip:
            skip -= 1
        elif character == " " and not quote:
            if expression[i : i + 4] == " or ":
                result += " or "
                skip = 3
            elif expression[i : i + 5] == " and ":
                result += " and "
                skip = 4
            else:
                pass  # result += ""
        elif character in ("'", '"'):
            quote = "" if quote else character
            result += character
        else:
            result += character

    return result


def _split(expression: str, separator: str) -> Iterator[str]:  # pragma: no cover
    """
    Split expression at occurrences of specified seperator, except
    where within quotation marks.

    >>> list(_split('./root/path[@a="n/a"]', '/'))
    ['.', 'root', 'path[@a="n/a"]']

    >>> list(_split('@type="translation" and @xml:lang="en"', ' and '))
    ['@type="translation"', '@xml:lang="en"']

    """
    assert separator not in ('"', "'")
    cursor = 0
    part = ""
    quote = ""
    separator_length = len(separator)

    for i, character in enumerate(expression):
        if i < cursor:
            continue

        if expression[i : i + separator_length] == separator and not quote:
            yield part
            part = ""
            cursor += separator_length
            continue

        if character == quote:
            quote = ""
        elif character in ('"', "'"):
            quote = character

        part += character
        cursor += 1

    yield part


class LegacyXPathExpression:
    """
    Holds the location paths of an expression, which are separated by ``|`` in the
    whitespace-reduced expression string.
    """

    __slots__ = ("location_paths",)

    def __init__(self, expression: str):
        normalized = _reduce_whitespace(expression)
        self.location_paths = [
            LegacyLocationPath(path) for path in _split(normalized, "|")
        ]

    def __str__(self):
        return " | ".join(map(str, self.location_paths))


class LegacyLocationPath:
    """
    Holds the location steps of a location path, which are separated by ``/`` in the
    path's expression string.
    """

    __slots__ = ("location_steps",)

    def __init__(self, expression: str):
        # _split always yields at least one part
        first, *remaining = _split(expression, "/")
        # an empty first part, e.g. caused by a leading slash, is not turned into a step
        steps = [first, *remaining] if first else remaining
        self.location_steps = [LegacyLocationStep(step) for step in steps]

    def __str__(self):
        return "/".join(map(str, self.location_steps))


class LegacyLocationStep:  # pragma: no cover
    """
    Holds the axis, the node test and the unparsed predicates string of a single
    location step. Abbreviated steps are expanded to their verbose axis form.
    """

    __slots__ = ("axis", "node_test", "predicates")

    def __init__(self, expression: str):
        self.axis: str
        self.node_test: LegacyNodeTest
        self.predicates = ""

        # abbreviated steps that all test for any node on a specific axis
        abbreviated_steps = {"": "descendant-or-self", ".": "self", "..": "parent"}
        if expression in abbreviated_steps:
            self.axis = abbreviated_steps[expression]
            self.node_test = LegacyNodeTest("node()")
            return

        if "[" in expression:
            # everything from the first opening bracket on is kept as predicates
            expression, _, remainder = expression.partition("[")
            assert remainder[-1] == "]", remainder
            self.predicates = "[" + remainder

        if "::" in expression:
            self.axis, node_test_part = expression.split("::")
        else:
            # without an explicit axis the child axis is assumed
            self.axis, node_test_part = "child", expression
        self.node_test = LegacyNodeTest(node_test_part)

    def __str__(self):
        return "".join((self.axis, "::", self.node_test.data, self.predicates))


class LegacyNodeTest:
    """
    Holds a node test's expression and its kind: ``type_test`` if the expression ends
    with a closing parenthesis, ``name_test`` otherwise.
    """

    __slots__ = ("data", "type")

    def __init__(self, expression: str):
        self.data = expression
        self.type = "type_test" if expression.endswith(")") else "name_test"
Source code for _delb.xpath

Table of Contents

Related Topics