# Source code for _delb.parser

# Copyright (C) 2018-'25  Frank Sachsenheim
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations

import codecs
import re
import warnings
from enum import IntEnum, auto
from io import BytesIO
from typing import Final, TYPE_CHECKING, NamedTuple, Optional, TypeAlias, cast

from _delb.plugins import plugin_manager

if TYPE_CHECKING:
    from collections.abc import Iterator, Sequence

    from _delb.plugins import XMLEventParserInterface
    from _delb.typing import BinaryReader, InputStream, _AttributesData


# Known Unicode byte order marks as ``(size, mark, codec name)`` records.  The order
# is significant for lookups that stop at the first hit: the UTF-32-LE mark begins
# with the bytes of the UTF-16-LE mark, hence the longer marks are listed first.
BOM_TO_ENCODING_NAME: Final = tuple(
    (len(mark), mark, codec_name)
    for mark, codec_name in (
        (codecs.BOM_UTF32_LE, "utf-32-le"),
        (codecs.BOM_UTF32_BE, "utf-32-be"),
        (codecs.BOM_UTF8, "utf-8"),
        (codecs.BOM_UTF16_LE, "utf-16-le"),
        (codecs.BOM_UTF16_BE, "utf-16-be"),
    )
)


# Matches an XML declaration at the very start of a byte string and captures the
# name that is given in its encoding declaration.  In line with the ``XMLDecl``
# production of the XML 1.0 specification, runs of whitespace between the
# pseudo-attributes, whitespace around ``=``, dots in the encoding name and anything
# that follows the encoding declaration (e.g. a ``standalone`` declaration) are
# tolerated.  The closing ``?>`` is deliberately not required because possibly only
# the first bytes of a stream are examined.
_match_encoding: Final = re.compile(
    rb"""<\?xml\s+version\s*=\s*["']1\.[0-9]+["']"""
    rb"""\s+encoding\s*=\s*["']([A-Za-z0-9._-]+)["']"""
).match


class _EncodingDetectingReader:
    """
    Wraps a binary reader in order to peek at its first bytes for detecting the
    encoding without losing these bytes for subsequent reads.

    :meth:`get_encoding` must be called before the first :meth:`read`.
    """

    __slots__ = ("buffer", "first_bytes", "reading")

    def __init__(self, buffer: BinaryReader):
        self.buffer: Final = buffer
        # peeked bytes that were not yet handed out by `read`
        self.first_bytes = b""
        # turns True with the first call of `read`
        self.reading = False

    def get_encoding(self) -> str | None:
        """
        Peeks at the first 64 bytes of the wrapped reader and tries to detect the
        encoding from them.

        :return: The detected encoding's name or :obj:`None`.
        :raises RuntimeError: When data has already been read from this object.
        """
        if self.reading:  # pragma: no cover
            raise RuntimeError("Get the encoding before reading from the buffer!")

        # peeking again would drop the bytes that were already consumed from the
        # wrapped reader, hence they are reused on repeated calls
        if not self.first_bytes:
            self.first_bytes = self.buffer.read(64)
        return detect_encoding(self.first_bytes)

    def read(self, n: int = -1) -> bytes:
        """
        Reads from the peeked bytes first, then from the wrapped reader.

        :param n: The maximum number of bytes to return, a negative value or
                  :obj:`None` reads everything that is left.
        :return: At most ``n`` bytes unless all remaining data was requested.
        """
        self.reading = True
        peeked = self.first_bytes

        if not peeked:
            return self.buffer.read(n)

        if n is None or n < 0:
            self.first_bytes = b""
            return peeked + self.buffer.read(n)

        if n <= len(peeked):
            # the request can be served from the peeked bytes alone, the remainder
            # is kept for the following reads
            self.first_bytes = peeked[n:]
            return peeked[:n]

        self.first_bytes = b""
        return peeked + self.buffer.read(n - len(peeked))



# [docs]
class EventType(IntEnum):
    """The kinds of events that make up an XML event stream."""

    Comment = 1
    ProcessingInstruction = 2
    TagStart = 3
    TagEnd = 4
    Text = 5




# [docs]
class ParserOptions(NamedTuple):
    """
    The configuration options that define an XML parser's behaviour.

    The used parser backend is determined by the backends' availability and the
    ``preferred_parsers`` setting.  *delb* comes with two contributed implementations
    and further ones can be added to the plugin manager based on
    :class:`_delb.plugins.XMLEventParserInterface`.

    Both contributed implementations should not be tasked with documents that refer
    to invalid *Document Type Declarations* (DTDs).  Such documents may pass when the
    character entity declarations that the DTD includes aren't used in the character
    data of the document, or they lead to errors of different degrees of severity.
    Character entity declarations are the only considered DTD feature to provide
    backward compatibility.

    Both will not allow some non-word characters as part of XML names that should be
    allowed with the 5th edition of the XML 1.0 specification, e.g. ``:`` or single
    combining characters.

    Beside the :exc:`_delb.exceptions.ParsingError` exception and its derivations the
    employed parsers may evoke their specific exceptions when confronted with invalid
    syntax and not-so-well-formed documents.

    The ``expat`` parser adapter depends on the :mod:`xml.sax.expatreader` module from
    the standard library that is available with many Python distributions.

    The ``lxml`` based parser requires the *lxml* package to be present in the
    interpreter environment.  This parser is prone to crashing when processing invalid
    DTDs, it also fails with uncommon, but still valid by spec, DTD contents.  It
    should not be used with other encodings than Unicode to avoid crashes.  Neither
    should it be used in conjunction with the ``load_referenced_resources`` option
    when processing larger files / trees.
    """  # noqa: RST304

    encoding: Optional[str] = None
    """
    This should be used for streams where the encoding is not noted in an XML document
    declaration or indicated by a BOM for Unicode encodings.  It doesn't affect parsing
    of data that is passed as :class:`str`.  Default: :obj:`None`.
    """
    load_referenced_resources: bool = False
    """Allows the loading of referenced external DTDs.  Default: :obj:`False`."""
    preferred_parsers: str | Sequence[str] = ("lxml", "expat")
    """
    A parser adapter name or a sequence of such that are preferably to be used.
    Default: ``("lxml", "expat")``.
    """
    reduce_whitespace: bool = False
    """
    :meth:`Reduce the content's whitespace <delb.Document.reduce_whitespace>`.
    Default: :obj:`False`.
    """
    remove_comments: bool = False
    """Ignore comments.  Default: :obj:`False`."""
    remove_processing_instructions: bool = False
    """
    Don't include processing instructions in the parsed tree.  Default: :obj:`False`.
    """
    unplugged: bool = False
    """Don't load referenced resources over network.  Default: :obj:`False`."""




# [docs]
class TagEventData(NamedTuple):
    """
    The data that is carried by :py:enum:member:`EventType.TagStart` and optionally by
    :py:enum:member:`EventType.TagEnd` events.
    """

    namespace: str
    local_name: str
    attributes: _AttributesData
    """
    The attributes data must not contain XML namespace declarations.
    It is optional in case of a :py:enum:`EventType.TagEnd`.
    """



# The type of the ``(event type, data)`` pairs that make up an XML event stream.  The
# attribute docstring below lists which kind of data accompanies which event type.
Event: TypeAlias = tuple[EventType, str | tuple[str, str] | TagEventData | None]
"""
An XML stream event tuple consists of two values.  The first is a member of
:class:`EventType` that signals the type of event, the second carries the relevant data.
All data must be stripped of XML markup characters and character data must be completely
parsed and normalized.  All XML names and character entities must be resolved.

.. list-table:: XML event tuples' structure
    :widths: auto

    * - Event member
      - Data type
      - Notes
    * - :py:enum:member:`EventType.Comment`
      - :class:`str`
      -
    * - :py:enum:member:`EventType.ProcessingInstruction`
      - :class:`tuple` [:class:`str`, :class:`str`]
      - ``(target, content)``
    * - :py:enum:member:`EventType.TagStart`
      - :class:`TagEventData`
      -
    * - :py:enum:member:`EventType.TagEnd`
      - :class:`TagEventData` | :class:`None`
      - If data is provided, the tree builder can detect inconsistent tagging in debug
        mode.
    * - :py:enum:member:`EventType.Text`
      - :class:`str`
      -
"""


def detect_encoding(stream: bytes) -> str | None:
    """
    Tries to determine the encoding of an XML byte stream from its beginning.

    An XML declaration at the start of the data is looked for first, only if none
    with an encoding name is found the data is tested for a leading Unicode byte
    order mark.

    :param stream: The first bytes of the serialized document.
    :return: The declared encoding name respectively the codec name that corresponds
             to the found byte order mark, :obj:`None` if neither was found.
    """
    declaration = _match_encoding(stream)
    if declaration is not None:
        return declaration.group(1).decode("ascii")

    # the first matching record wins
    return next(
        (name for size, mark, name in BOM_TO_ENCODING_NAME if stream[:size] == mark),
        None,
    )


def _make_parser(
    options: ParserOptions, *, base_url: str | None, encoding: str
) -> XMLEventParserInterface:
    """
    Instantiates the parser implementation that the plugin manager selects for the
    ``preferred_parsers`` option.

    :param options: The parser options, they are also passed on to the parser.
    :param base_url: Passed on to the parser.
    :param encoding: Passed on to the parser.
    :return: The configured parser instance.
    """
    parser_class = plugin_manager.get_parser(options.preferred_parsers)
    return parser_class(options, base_url=base_url, encoding=encoding)


def parse_events(
    input_: InputStream, options: ParserOptions, base_url: str | None
) -> Iterator[Event]:
    """
    Yields the events that a parser produces for the given input.

    The encoding that is passed to the parser is ``utf-8`` for :class:`str` input,
    otherwise the ``encoding`` option, and if that is :obj:`None` the result of an
    encoding detection on the first bytes of the data.  When the detection fails as
    well a :class:`UserWarning` is issued and ``utf-8`` is used.

    :param input_: The serialized document as string, bytes or binary reader.
    :param options: The parser options.
    :param base_url: Passed on to the parser.
    :return: An iterator over the parser's events.
    """
    stream = input_
    encoding = options.encoding

    if isinstance(stream, str):
        encoding = "utf-8"

    elif isinstance(stream, bytes):
        if encoding is None:
            encoding = detect_encoding(stream)
        stream = BytesIO(stream)

    elif encoding is None:
        if not stream.seekable():
            # the peeked bytes can't be re-read from such stream, the wrapper hands
            # them out again when the parser reads
            stream = _EncodingDetectingReader(stream)
            encoding = stream.get_encoding()
        else:
            head = stream.read(64)
            encoding = detect_encoding(head)
            # NOTE(review): this rewinds to the absolute start of the stream, not to
            # the position it had before peeking
            stream.seek(0)

    if encoding is None:
        warnings.warn(
            "No encoding known for parsing an XML stream. Defaulting to UTF-8.",
            category=UserWarning,
        )
        encoding = "utf-8"

    parser = _make_parser(options, base_url=base_url, encoding=encoding)
    yield from parser.parse(cast("BinaryReader", stream))


# The public names of this module.  Objects that know their own name are referenced
# so that renaming one of them can't leave a stale entry behind, the ``Event`` type
# alias is listed literally.
__all__ = (
    "Event",
    EventType.__name__,
    ParserOptions.__name__,
    TagEventData.__name__,
    detect_encoding.__name__,
)