Source code for _delb.plugins

# Copyright (C) 2018-'25  Frank Sachsenheim
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Sequence
from importlib.metadata import entry_points
from importlib.util import find_spec
from typing import TYPE_CHECKING, Any, Final, overload

if TYPE_CHECKING:
    from types import SimpleNamespace
    from delb import Document
    from _delb.parser import Event, ParserOptions
    from _delb.typing import (
        BinaryReader,
        GenericDecorated,
        Loader,
        LoaderConstraint,
        SecondOrderDecorator,
        XPathFunction,
    )


class DocumentMixinBase:
    """
    Deriving a class from this base registers it as a document extension plugin.
    Such extensions are meant to contribute further attributes to a document, for
    instance derived data or methods that interact with storage systems. Every
    attribute of an extension should carry a common prefix that ends with an
    underscore, e.g. `storage_load`, `storage_save`, etc.

    The base class is also the end of the line for methods that mixin classes may
    implement. Each implementation of such a method must call the one of a base
    class, e.g.:

    .. code-block::

        from types import SimpleNamespace

        from _delb.plugins import DocumentMixinBase
        from magic_wonderland import play_disco


        class MyExtension(DocumentMixinBase):

            # this method can be implemented by any extension class
            @classmethod
            def _init_config(cls, config, kwargs):
                config.my_extension = SimpleNamespace(tonality=kwargs.pop(
                    "my_extension_tonality")
                )
                super()._init_config(config, kwargs)

            # this method is specific to this extension
            def my_extension_makes_magic(self):
                play_disco(self.config.my_extension.tonality)
    """

    def __init_subclass__(cls):
        # Only direct subclasses are registered; for those this base class is the
        # entry that immediately follows the new class in its MRO.
        nearest_ancestor = cls.__mro__[1]
        if nearest_ancestor is not DocumentMixinBase:
            return
        plugin_manager.document_mixins.append(cls)

    @classmethod
    def _init_config(cls, config: SimpleNamespace, kwargs: dict[str, Any]):
        """
        ``kwargs`` holds the additional keyword arguments that a
        :class:`delb.Document` instance is called with. Extension classes that
        expect configuration data *must* process their specific arguments by
        clearing them from the ``kwargs`` dictionary, e.g. with :meth:`dict.pop`.
        The resulting configuration data is preferably stored in a
        :class:`types.SimpleNamespace` that is added under the extension's name to
        the :class:`types.SimpleNamespace` passed as ``config``. The keyword
        arguments mentioned at the beginning *should* be prefixed with that name
        as well.

        This method is called before the loaders try to read and parse the given
        source for a document.

        :raises RuntimeError: When arguments are left over in ``kwargs`` once the
                              call chain arrives at this terminating
                              implementation.
        """
        if not kwargs:
            return

        raise RuntimeError(
            "Not all configuration arguments have been processed. You either "
            "passed invalid arguments or an extension doesn't handle them "
            f"properly: {kwargs}"
        )
class PluginManager:
    """
    Holds the registries that extensions contribute to: document mixin classes,
    document subclasses, loaders, parser adapters and XPath functions.
    """

    __slots__ = (
        "document_mixins",
        "document_subclasses",
        "loaders",
        "parsers",
        "xpath_functions",
    )

    def __init__(self):
        self.document_mixins: Final[list[type[DocumentMixinBase]]] = []
        self.document_subclasses: Final[list[type[Document]]] = []
        self.loaders: Final[list[Loader]] = []
        self.parsers: Final[dict[str, type[XMLEventParserInterface]]] = {}
        self.xpath_functions: Final[dict[str, XPathFunction]] = {}

    def get_parser(
        self, preferences: str | Sequence[str]
    ) -> type[XMLEventParserInterface]:
        """
        Returns the first registered parser adapter class whose name is listed in
        the given preferences.

        :param preferences: A single parser name or a sequence of names in
                            descending priority.
        :raises ValueError: When none of the names refers to a registered parser.
        """
        if isinstance(preferences, str):
            preferences = (preferences,)

        for name in preferences:
            if (parser := self.parsers.get(name)) is not None:
                return parser

        raise ValueError(f"No matching parser for {preferences} available.")

    @staticmethod
    def load_plugins():
        """
        Loads all modules that are registered as entrypoint in the ``delb`` group
        and imports contributed extensions whose dependencies are available.
        """
        # the contributed modules register themselves as a side effect of the
        # import, hence the imported names are intentionally unused
        if find_spec("httpx"):
            import _delb.plugins.web_loader  # noqa: F401

        if find_spec("lxml"):
            import _delb.plugins.lxml_parser  # noqa: F401

        if find_spec("pyexpat"):
            import _delb.plugins.expat_parser  # noqa: F401

        for entrypoint in entry_points().select(group="delb"):
            entrypoint.load()

    def register_loader(
        self, before: LoaderConstraint = None, after: LoaderConstraint = None
    ) -> SecondOrderDecorator:
        """
        Registers a document loader.

        An example module that is specified as ``delb`` plugin for an IPFS loader
        might look like this:

        .. testcode::

            from os import getenv
            from types import SimpleNamespace
            from typing import Any

            from _delb.plugins import plugin_manager
            from _delb.plugins.web_loader import web_loader
            from _delb.typing import LoaderResult


            IPFS_GATEWAY = getenv("IPFS_GATEWAY_PREFIX", "https://ipfs.io/ipfs/")


            @plugin_manager.register_loader()
            def ipfs_loader(source: Any, config: SimpleNamespace) -> LoaderResult:
                if isinstance(source, str) and source.startswith("ipfs://"):
                    config.source_url = source
                    config.ipfs_gateway_source_url = IPFS_GATEWAY + source[7:]
                    return web_loader(config.ipfs_gateway_source_url, config)

                # return an indication why this loader didn't attempt to load in
                # order to support debugging
                return "The input value is not an URL with the ipfs scheme."

        The ``source`` argument is what a :class:`delb.Document` instance is
        initialized with as input data.

        Note that the ``config`` argument that is passed to a loader function
        contains configuration data, it's the :attr:`delb.Document.config` property
        after :meth:`_init_config <_delb.plugins.DocumentMixinBase._init_config>`
        has been processed.

        Loaders that retrieve a document from an URL should add the origin as
        string to the ``config`` object as ``source_url``.

        You might want to specify a loader to be considered before or after another
        one. Let's assume a loader shall figure out what to load from a remote XML
        resource that contains a reference to the actual document. That one would
        have to be considered before the one that loads XML documents from a URL
        with the `https` scheme:

        .. testcode::

            from _delb.plugins import plugin_manager
            from _delb.plugins.web_loader import web_loader


            @plugin_manager.register_loader(before=web_loader)
            def mets_loader(source, config) -> LoaderResult:
                # loading logic here
                pass

        :param before: One or more registered loaders that the new one shall
                       precede.
        :param after: One or more registered loaders that the new one shall
                      follow.
        :raises NotImplementedError: When both constraints are given.
        :raises ValueError: When a loader that is referred to by a constraint is
                            not registered.
        """
        if before is not None and after is not None:
            raise NotImplementedError(
                "Loaders may only define one constraint atm. Please open an issue with "
                "a use-case description if you need to define both."
            )

        registered_loaders = self.loaders

        # the insertion position is determined when the decorator is created, not
        # when it is eventually applied to a loader
        if before is not None:
            if not isinstance(before, Iterable):
                before = (before,)
            index = min(registered_loaders.index(x) for x in before)
        elif after is not None:
            if not isinstance(after, Iterable):
                after = (after,)
            index = max(registered_loaders.index(x) for x in after) + 1
        else:
            index = len(registered_loaders)

        def registrar(loader: Loader) -> Loader:
            assert callable(loader)
            registered_loaders.insert(index, loader)
            return loader

        return registrar

    @overload
    def register_xpath_function(self, arg: str) -> SecondOrderDecorator: ...

    @overload
    def register_xpath_function(self, arg: GenericDecorated) -> GenericDecorated: ...

    def register_xpath_function(
        self, arg: str | GenericDecorated
    ) -> SecondOrderDecorator | GenericDecorated:
        """
        Custom XPath functions can be defined as shown in the following example.
        The first argument to a function is always an instance of
        :class:`_delb.xpath.EvaluationContext` followed by the expression's
        arguments.

        .. testcode::

            from delb import Document
            from _delb.plugins import plugin_manager
            from _delb.xpath import EvaluationContext


            @plugin_manager.register_xpath_function("is-last")
            def is_last(context: EvaluationContext) -> bool:
                return context.position == context.size

            @plugin_manager.register_xpath_function
            def lowercase(_, string: str) -> str:
                return string.lower()


            document = Document("<root><node/><node foo='BAR'/></root>")
            print(document.xpath("//*[is-last() and lowercase(@foo)='bar']").first)

        .. testoutput::

            <node foo="BAR"/>

        :param arg: Either the name to register the decorated function with or the
                    function itself, which is then registered with its
                    ``__name__``.
        :raises TypeError: When ``arg`` is neither a string nor a callable.
        """
        if isinstance(arg, str):

            def wrapper(func: XPathFunction) -> XPathFunction:
                self.xpath_functions[arg] = func
                return func

            return wrapper

        if callable(arg):
            self.xpath_functions[arg.__name__] = arg
            return arg

        # Falling through would hand ``None`` back to the caller and defer the
        # failure to an unrelated place, so misuse is reported right here.
        raise TypeError(
            "An XPath function must be registered with a name or as a callable, "
            f"not with an instance of {type(arg).__name__}."
        )
class XMLEventParserInterface(ABC):
    """
    The base class that parser adapters derive from. Once an adapter has been
    initialized, its :meth:`parse` method is called to iterate over parser events.
    As an instance is employed only once, it doesn't need to care about its state
    beyond the parsing of one input stream.

    :param options: The parsing options the user passed with the input stream.
    :param base_url: The base URL for resolving references.
    :param encoding: This is the encoding that was either provided by the user,
                     noted in an XML document declaration or indicated by a Byte
                     Order Mark. But it could also be the fallback value ``utf-8``
                     if none of the prior was available.
    """

    name: str
    """
    The value of this class attribute selects the parser as (member of) a
    :attr:`delb.parser.ParserOptions.preferred_parsers` setting.
    """

    def __init_subclass__(cls):
        # each subclass is filed in the plugin manager's parser registry under
        # its declared name
        registry = plugin_manager.parsers
        registry[cls.name] = cls

    @abstractmethod
    def __init__(self, options: ParserOptions, base_url: str | None, encoding: str):
        """Concrete adapters must provide their own initialization."""

    @abstractmethod
    def parse(self, data: BinaryReader | str) -> Iterator[Event]:
        """
        Implementations are mandatory and must yield the parsed contents as
        :obj:`Event` tuples in document order.
        """
# The module-wide manager instance that the classes defined in this module
# register themselves with.
plugin_manager = PluginManager()

# The names that are exported by a star-import of this module.
__all__ = (
    DocumentMixinBase.__name__,
    "plugin_manager",
)